\documentclass[landscape,12pt,titlepage,letterpaper,oneside,final,fleqn]{article} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \usepackage{amsfonts} \usepackage{amsmath} \setcounter{MaxMatrixCols}{10} %TCIDATA{OutputFilter=LATEX.DLL} %TCIDATA{Version=5.00.0.2570} %TCIDATA{} %TCIDATA{Created=Monday, March 15, 2004 08:06:32} %TCIDATA{LastRevised=Thursday, March 29, 2007 10:19:25} %TCIDATA{} %TCIDATA{} %TCIDATA{CSTFile=Overhead Transparencies.cst} %TCIDATA{PageSetup=72,72,72,72,1} %TCIDATA{Counters=arabic,1} %TCIDATA{AllPages= %H=36 %F=36 %} \newtheorem{theorem}{Theorem} \newtheorem{acknowledgement}[theorem]{Acknowledgement} \newtheorem{algorithm}[theorem]{Algorithm} \newtheorem{axiom}[theorem]{Axiom} \newtheorem{case}[theorem]{Case} \newtheorem{claim}[theorem]{Claim} \newtheorem{conclusion}[theorem]{Conclusion} \newtheorem{condition}[theorem]{Condition} \newtheorem{conjecture}[theorem]{Conjecture} \newtheorem{corollary}[theorem]{Corollary} \newtheorem{criterion}[theorem]{Criterion} \newtheorem{definition}[theorem]{Definition} \newtheorem{example}[theorem]{Example} \newtheorem{exercise}[theorem]{Exercise} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{notation}[theorem]{Notation} \newtheorem{problem}[theorem]{Problem} \newtheorem{proposition}[theorem]{Proposition} \newtheorem{remark}[theorem]{Remark} \newtheorem{solution}[theorem]{Solution} \newtheorem{summary}[theorem]{Summary} \newenvironment{proof}[Proof]{\noindent\textbf{#1.} }{\ \rule{0.5em}{0.5em}} \input{tcilatex} \begin{document} \title{Multiple Imputation, I} \author{John M. 
Abowd}
\date{March, 2007}
\maketitle
\section{Models, Parameters, Bayes Law}
\subsubsection{Symmetric treatment of data and parameters (joint distribution of the data and the parameters)%
\protect\begin{equation*}
p\left( Y,\protect\theta \right)
\protect\end{equation*}%
}
\subsubsection{Two factorizations}
Likelihood and prior parameters:
\begin{equation*}
p\left( Y,\theta \right) =p\left( Y\left\vert \theta \right. \right) p\left( \theta \right)
\end{equation*}
where the first factor, $p\left( Y\left\vert \theta \right. \right) \equiv \ell \left( \theta \left\vert Y\right. \right)$ is the data model (or likelihood function) and the second factor $p\left( \theta \right)$ is the prior distribution of the parameters.
Alternatively, posterior parameters and marginal distribution of the data:%
\begin{equation*}
p\left( Y,\theta \right) =p\left( \theta \left\vert Y\right. \right) p\left( Y\right)
\end{equation*}
where $p\left( \theta \left\vert Y\right. \right)$ is the posterior distribution of the parameters (note that $p\left( \theta \left\vert Y\right. \right)$ is not the same function as $\ell \left( \theta \left\vert Y\right. \right)$) and $p\left( Y\right)$ is the marginal (predictive) distribution of the data.
By direct manipulation, we can derive the posterior distribution of the parameters%
\begin{equation*}
p\left( \theta \left\vert Y\right. \right) =\frac{p\left( Y,\theta \right) }{%
p\left( Y\right) }=\frac{p\left( Y\left\vert \theta \right. \right) p\left( \theta \right) }{p\left( Y\right) }\equiv \frac{\ell \left( \theta \left\vert Y\right. \right) p\left( \theta \right) }{p\left( Y\right) }.
\end{equation*}
\pagebreak
\section{Bayes Rule with Missing Data}
Now, consider what happens if we observe some of the data but not other parts.
Partition $Y$ as%
\begin{equation*}
Y=\left( Y_{mis},Y_{obs}\right)
\end{equation*}%
and let the matrix $M$ be defined by%
\begin{equation*}
m_{ij}=\left\{
\begin{array}{l}
1,\text{ if }y_{ij}\text{ is missing} \\
0,\text{ otherwise}%
\end{array}%
\right.
\end{equation*}%
Then, the joint distribution of interest is%
\begin{equation*}
p\left( Y_{mis},Y_{obs},M,\theta ,\psi \right) =p\left( Y_{mis},Y_{obs},M\left\vert \theta ,\psi \right. \right) p\left( \theta ,\psi \right)
\end{equation*}%
Decomposing leads to%
\begin{equation*}
p\left( Y_{mis},Y_{obs},M\left\vert \theta ,\psi \right. \right) =p\left( M\left\vert Y_{mis},Y_{obs},\psi \right. \right) p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right)
\end{equation*}%
where the parameters $\psi$ are associated with the missing data mechanism and the parameters $\theta$ are associated with the complete data model (or likelihood function).
There can be a functional dependence between $\psi$ and $\theta$, for example $\left( \theta ,\psi \right) =\left( \theta \left( \beta \right) ,\psi \left( \beta \right) \right)$; so, there is no loss of generality in this notation.
The distribution $p\left( M\left\vert Y_{mis},Y_{obs},\psi \right. \right)$ is called the missing data generating mechanism.
The distribution $p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right)$ is called the complete data model.
\pagebreak
\section{Ignorability (Likelihood based)}
To do inference in the presence of $Y_{mis}$ consider each necessary piece%
\begin{align*}
p\left( Y_{obs},M\left\vert \theta ,\psi \right. \right) & =\dint p\left( M\left\vert Y_{mis},Y_{obs},\psi \right. \right) p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) dY_{mis} \\
& =p\left( M\left\vert Y_{obs},\psi \right. \right) \dint p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) dY_{mis}
\end{align*}
Likelihood-based ignorability conditions hold:
\begin{itemize}
\item Data missing at random (MAR) $p\left( M\left\vert Y_{mis},Y_{obs},\psi \right.
\right) =p\left( M\left\vert Y_{obs},\psi \right. \right) .$
\item Missing data and model parameters are distinct $\left( \theta ,\psi \right) \neq \left( \theta \left( \beta \right) ,\psi \left( \beta \right) \right)$ for some lower dimensional space $\beta .$
\end{itemize}
Bayesian ignorability conditions:
\begin{itemize}
\item Data missing at random (MAR) $p\left( M\left\vert Y_{mis},Y_{obs},\psi \right. \right) =p\left( M\left\vert Y_{obs},\psi \right. \right) .$
\item The prior distribution of $\left( \theta ,\psi \right)$ factors into $%
p\left( \theta \right) p\left( \psi \right) .$
\end{itemize}
\pagebreak
\section{Imputing Missing Data}
Multiple imputation is based on the posterior predictive distribution of $%
Y_{mis}$.
\begin{equation*}
p\left( Y_{mis}\left\vert Y_{obs},M,\theta ,\psi \right. \right) =\frac{%
p\left( M\left\vert Y_{mis},Y_{obs},\psi \right. \right) p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) }{p\left( Y_{obs},M\left\vert \theta ,\psi \right. \right) }
\end{equation*}%
Integrating out the parameters yields%
\begin{equation*}
p\left( Y_{mis}\left\vert Y_{obs},M\right. \right) =\diint \frac{p\left( M\left\vert Y_{mis},Y_{obs},\psi \right. \right) p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) }{p\left( Y_{obs},M\left\vert \theta ,\psi \right. \right) }p\left( \theta ,\psi \left\vert Y_{obs},M\right. \right) d\theta d\psi
\end{equation*}
\pagebreak
\section{Operationalizing the Distribution}
This distribution is the basis for multiple imputation of missing data.
To be useful it must be operationalized considering the same kinds of assumptions that are discussed above.
Imposing MAR and Bayesian ignorability yields:%
\begin{align*}
p\left( Y_{mis}\left\vert Y_{obs},M\right.
\right) & =E_{\left( \theta ,\psi |Y_{obs},M\right) }\left[ \frac{\text{Joint Distribution}\left( Y_{mis},Y_{obs},M\right) }{\text{Joint Distribution}\left( Y_{obs},M\right) }%
\right] \\
& =\diint \left[ \frac{p\left( M\left\vert Y_{mis},Y_{obs},\psi \right. \right) p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) }{p\left( Y_{obs},M\left\vert \theta ,\psi \right. \right) }\right] p\left( \theta ,\psi \left\vert Y_{obs},M\right. \right) d\theta d\psi \\
& =\diint \frac{p\left( M\left\vert Y_{obs},\psi \right. \right) p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) }{p\left( M\left\vert Y_{obs},\psi \right. \right) \dint p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) dY_{mis}}p\left( \theta \left\vert Y_{obs}\right. \right) p\left( \psi \left\vert Y_{obs},M\right. \right) d\theta d\psi \\
& =\dint \frac{p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) }{%
\dint p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) dY_{mis}}%
p\left( \theta \left\vert Y_{obs}\right. \right) d\theta \dint p\left( \psi \left\vert Y_{obs},M\right. \right) d\psi \\
& =p\left( Y_{mis}\left\vert Y_{obs}\right. \right)
\end{align*}
\pagebreak
\section{Details}
\begin{equation*}
p\left( Y_{obs},M\left\vert \theta ,\psi \right. \right) =p\left( M\left\vert Y_{obs},\psi \right. \right) \dint p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) dY_{mis}
\end{equation*}%
\begin{equation*}
p\left( Y_{mis}\left\vert Y_{obs},\theta \right. \right) =\frac{p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) }{\dint p\left( Y_{mis},Y_{obs}\left\vert \theta \right. \right) dY_{mis}}
\end{equation*}%
and%
\begin{equation*}
\dint p\left( \psi \left\vert Y_{obs},M\right. \right) d\psi =1
\end{equation*}%
Note, there are other ways to operationalize the posterior predictive distribution so that ignorability does not have to be imposed.
\pagebreak
\section{The Bayesian Bootstrap Multiple Imputation Procedure}
Let $\left( Y_{obs},Y_{mis}\right)$ be partitioned such that observations $%
1,\ldots ,n_{obs}$ correspond to the data in $Y_{obs}$ and $n_{obs}+1,\ldots ,n$ correspond to the data in $Y_{mis}.$
Algorithm:
\begin{itemize}
\item Draw $n_{obs}-1$ random numbers from $U\left( 0,1\right)$. Sort them from lowest to highest and label them $a_{1},\ldots ,a_{n_{obs}-1}.$ Define $a_{0}=0$ and $a_{n_{obs}}=1.$
\item Impute each of $n-n_{obs}=n_{mis}$ values of $Y_{mis}$ by sampling with replacement from $Y_{obs}$ using the probabilities $\left( a_{1}-a_{0}\right) ,\ldots ,\left( a_{n_{obs}}-a_{n_{obs}-1}\right)$. I.e., draw $n_{mis}$ random numbers from $U\left( 0,1\right)$; impute $Y_{i}$ when \$a_{i-1}