Estimation of the observed Fisher information matrix

$
\def\hphi{\tilde{\phi}}
\def\hpsi{\tilde{\psi}}
\def\transpose{t}
\def\iparam{l}
\def\jparam{j}
\def\teps{\varepsilon}
\def\by{\boldsymbol{y}}
\def\bpsi{\boldsymbol{\psi}}
\newcommand{\like}{\cal L}
\def\llike{\cal LL}
\def\pmacro{\text{p}}
\def\py{\pmacro}
\def\pyi{\pmacro}
\def\pypsi{\pmacro}
\def\pcypsi{\pmacro}
\def\ppsi{\pmacro}
\def\ppsii{\pmacro}
\def\pyipsii{\pmacro}
\def\pcyipsii{\pmacro}
\newcommand{\Dt}[1]{\partial_\theta #1}
\newcommand{\DDt}[1]{\partial^2_{\theta} #1}
\newcommand{\Dphi}[1]{\partial_\phi #1}
\newcommand{\Dpsi}[1]{\partial_\psi #1}
\newcommand{\esp}[1]{\mathbb{E}\left(#1\right)}
\newcommand{\cov}[1]{\mbox{Cov}\left(#1\right)}
$

Estimation using stochastic approximation

The observed Fisher information matrix (F.I.M.) is a function of $\theta$ defined as

\(\begin{eqnarray} I(\theta) &=& -\DDt{\log ({\like}(\theta;\by))} \\ &=& -\DDt{\log (\py(\by;\theta))} . \end{eqnarray}\)
(1)

Because the likelihood is quite complex, $I(\theta)$ usually has no closed-form expression. It can, however, be estimated using a stochastic approximation procedure based on Louis' formula:

\(\DDt{\log (\pmacro(\by;\theta))} = \esp{\DDt{\log (\pmacro(\by,\bpsi;\theta))} | \by ;\theta} + \cov{\Dt{\log (\pmacro(\by,\bpsi;\theta))} | \by ; \theta}, \)

where

\(\begin{eqnarray} \cov{\Dt{\log (\pmacro(\by,\bpsi;\theta))} | \by ; \theta} &=& \esp{ \left(\Dt{\log (\pmacro(\by,\bpsi;\theta))} \right)\left(\Dt{\log (\pmacro(\by,\bpsi;\theta))}\right)^{\transpose} | \by ; \theta} \\ && - \esp{\Dt{\log (\pmacro(\by,\bpsi;\theta))} | \by ; \theta}\esp{\Dt{\log (\pmacro(\by,\bpsi;\theta))} | \by ; \theta}^{\transpose} . \end{eqnarray}\)

Thus, $\DDt{\log (\pmacro(\by;\theta))}$ is expressed as a combination of conditional expectations given the observations $\by$. Each of these conditional expectations can be estimated by Monte Carlo, or equivalently approximated using a stochastic approximation algorithm.
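
For instance, if $\bpsi^{(1)},\ldots,\bpsi^{(K)}$ are drawn from the conditional distribution $\pmacro(\bpsi | \by ;\theta)$, the first term of Louis' formula can be estimated by the empirical mean

\( \esp{\DDt{\log (\pmacro(\by,\bpsi;\theta))} | \by ;\theta} \approx \displaystyle{\frac{1}{K} }\sum_{k=1}^{K} \DDt{\log (\pmacro(\by,\bpsi^{(k)};\theta))} , \)

and the expectations appearing in the covariance term can be estimated in the same way.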

We can then draw a sequence $(\psi_i^{(k)})$ using a Metropolis-Hastings algorithm and estimate the observed F.I.M. online. At iteration $k$ of the algorithm:


  • Simulation step: for $i=1,2,\ldots,N$, draw $\psi_i^{(k)}$ from $m$ iterations of the Metropolis-Hastings algorithm described in The Metropolis-Hastings algorithm section, with $\pmacro(\psi_i |y_i ;{\theta})$ as the limiting distribution.


  • Stochastic approximation: update $D_k$, $G_k$ and $\Delta_k$ according to the following recurrence relations:


\(\begin{eqnarray} \Delta_k & = & \Delta_{k-1} + \gamma_k \left(\Dt{\log (\pmacro(\by,\bpsi^{(k)};{\theta}))} - \Delta_{k-1} \right) \\ D_k & = & D_{k-1} + \gamma_k \left(\DDt{\log (\pmacro(\by,\bpsi^{(k)};{\theta}))} - D_{k-1} \right)\\ G_k & = & G_{k-1} + \gamma_k \left((\Dt{\log (\pmacro(\by,\bpsi^{(k)};{\theta}))})(\Dt{\log (\pmacro(\by,\bpsi^{(k)};{\theta}))})^\transpose -G_{k-1} \right), \end{eqnarray}\)

where $(\gamma_k)$ is a decreasing sequence of positive numbers such that $\gamma_1=1$, $ \sum_{k=1}^{\infty} \gamma_k = \infty$, and $\sum_{k=1}^{\infty} \gamma_k^2 < \infty$.


  • Estimation step: update $H_k$ according to

\(H_k = D_k + G_k - \Delta_k \Delta_k^{\transpose}. \)

Since $H_k$ approximates $\DDt{\log (\pmacro(\by;\theta))}$, the observed F.I.M. $I(\theta)$ defined in (1) is estimated by $-H_k$.


Implementing this algorithm therefore requires computation of the first and second derivatives of

\(\log (\pmacro(\by,\bpsi;\theta))=\sum_{i=1}^{N} \log (\pmacro(y_i,\psi_i;\theta)).\)

Assume first that the joint distribution of $\by$ and $\bpsi$ decomposes as

\( \pypsi(\by,\bpsi;\theta) = \pcypsi(\by | \bpsi)\ppsi(\bpsi;\theta). \)
(2)

This assumption means that for any $i=1,2,\ldots,N$, all of the components of $\psi_i$ are random and there exists a sufficient statistic ${\cal S}(\bpsi)$ for the estimation of $\theta$. It is then sufficient to compute the first and second derivatives of $\log (\pmacro(\bpsi;\theta))$ in order to estimate the F.I.M. This can be done relatively simply in closed form when the individual parameters are normally distributed (or a transformation $h$ of them is).

If some component of $\psi_i$ has no variability, (2) no longer holds, but we can decompose $\theta$ into $(\theta_y,\theta_\psi)$ such that

\( \pyipsii(y_i,\psi_i;\theta) = \pcyipsii(y_i | \psi_i ; \theta_y)\ppsii(\psi_i;\theta_\psi). \)

We then need to compute the first and second derivatives of $\log(\pcyipsii(y_i |\psi_i ; \theta_y))$ and $\log(\ppsii(\psi_i;\theta_\psi))$. Derivatives of $\log(\pcyipsii(y_i |\psi_i ; \theta_y))$ that do not have a closed form expression can be obtained using central differences.


Remarks

1. Using $\gamma_k=1/k$ for $k \geq 1$ means that each term is approximated with an empirical mean obtained from $(\bpsi^{(k)}, k \geq 1)$. For instance,
\(\begin{eqnarray} \Delta_k &=& \Delta_{k-1} + \displaystyle{ \frac{1}{k} } \left(\Dt{\log (\pmacro(\by,\bpsi^{(k)};\theta))} - \Delta_{k-1} \right) \end{eqnarray}\)
(3)
\(\begin{eqnarray} &=& \displaystyle{ \frac{1}{k} }\sum_{j=1}^{k} \Dt{\log (\pmacro(\by,\bpsi^{(j)};\theta))} . \end{eqnarray}\)
(4)

(3) (resp. (4)) defines $\Delta_k$ using an online (resp. offline) algorithm. Writing $\Delta_k$ as in (3) instead of (4) avoids having to store all simulated sequences $(\bpsi^{(j)}, 1\leq j \leq k)$ when computing $\Delta_k$.


2. This approach is used for computing the F.I.M. $I(\hat{\theta})$ in practice, where $\hat{\theta}$ is the maximum likelihood estimate of $\theta$. The only difference with the Metropolis-Hastings used for SAEM is that the population parameter $\theta$ is not updated and remains fixed at $\hat{\theta}$.


In summary, for a given estimate $\hat{\theta}$ of the population parameter $\theta$, a stochastic approximation algorithm for estimating the observed Fisher Information Matrix $I(\hat{\theta})$ consists of:

1. For $i=1,2,\ldots,N$, run a Metropolis-Hastings algorithm to draw a sequence $\psi_i^{(k)}$ with limiting distribution $\pmacro(\psi_i |y_i ;\hat{\theta})$.

2. At iteration $k$ of the Metropolis-Hastings algorithm, compute the first and second derivatives of $\pypsi(\by,\bpsi^{(k)};\hat{\theta})$.

3. Update $\Delta_k$, $G_k$ and $D_k$, and compute $H_k$, from which the F.I.M. estimate $-H_k$ is obtained (see the sketch below).
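
As an illustration, here is a minimal Python sketch of steps 2 and 3 (not Monolix code). It assumes hypothetical user-supplied routines sample_psi_conditional (one Metropolis-Hastings sweep over the individuals, targeting $\pmacro(\psi_i | y_i ;\hat{\theta})$), grad_complete_loglik and hess_complete_loglik (first and second derivatives of $\log \pmacro(\by,\bpsi;\hat{\theta})$ with respect to $\theta$, returned as NumPy arrays).

    import numpy as np

    def observed_fim_sa(theta_hat, psi_init, n_iter,
                        sample_psi_conditional,   # hypothetical: one MH sweep over all individuals
                        grad_complete_loglik,     # hypothetical: d/dtheta log p(y, psi; theta)
                        hess_complete_loglik,     # hypothetical: d2/dtheta2 log p(y, psi; theta)
                        gamma=lambda k: 1.0 / k):
        """Stochastic approximation of the observed F.I.M. at theta_hat (sketch)."""
        m = len(theta_hat)
        Delta = np.zeros(m)       # running estimate of E[gradient | y]
        D = np.zeros((m, m))      # running estimate of E[Hessian | y]
        G = np.zeros((m, m))      # running estimate of E[gradient gradient^T | y]
        psi = psi_init
        for k in range(1, n_iter + 1):
            # Simulation step: refresh the individual parameters by Metropolis-Hastings
            psi = sample_psi_conditional(psi, theta_hat)
            g = np.asarray(grad_complete_loglik(theta_hat, psi))
            h = np.asarray(hess_complete_loglik(theta_hat, psi))
            # Stochastic approximation step
            gk = gamma(k)
            Delta += gk * (g - Delta)
            D += gk * (h - D)
            G += gk * (np.outer(g, g) - G)
        # Estimation step: H approximates the Hessian of the observed log-likelihood,
        # so the observed F.I.M. is estimated by -H
        H = D + G - np.outer(Delta, Delta)
        return -H

With $\gamma_k=1/k$, the updates above reduce to the empirical means of (3)-(4).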


Example 1


Consider the model

\(\begin{eqnarray} y_i | \psi_i &\sim& \pcyipsii(y_i | \psi_i) \\ h(\psi_i) &\sim_{i.i.d}& {\cal N}( h(\psi_{\rm pop}) , \Omega), \end{eqnarray}\)

where $\Omega = {\rm diag}(\omega_1^2,\omega_2^2,\ldots,\omega_d^2)$ is a diagonal matrix and $h(\psi_i)=(h_1(\psi_{i,1}), h_2(\psi_{i,2}), \ldots , h_d(\psi_{i,d}) )^{\transpose}$. The vector of population parameters is $\theta = (\psi_{\rm pop} , \Omega)=(\psi_{ {\rm pop},1},\ldots,\psi_{ {\rm pop},d},\omega_1^2,\ldots,\omega_d^2)$.

Here,

\( \log (\pyipsii(y_i,\psi_i;\theta)) = \log (\pcyipsii(y_i | \psi_i)) + \log (\ppsii(\psi_i;\theta)). \)

Then,

\(\begin{eqnarray} \Dt{\log (\pyipsii(y_i,\psi_i;\theta))} &=& \Dt{\log (\ppsii(\psi_i;\theta))} \\ \DDt{\log (\pyipsii(y_i,\psi_i;\theta))} &=& \DDt{\log (\ppsii(\psi_i;\theta))} . \end{eqnarray}\)

More precisely,

\(\begin{eqnarray} \log (\ppsii(\psi_i;\theta)) &=& -\displaystyle{\frac{d}{2} }\log(2\pi) + \sum_{\iparam=1}^d \log(h_\iparam^{\prime}(\psi_{i,\iparam})) -\displaystyle{ \frac{1}{2} } \sum_{\iparam=1}^d \log(\omega_\iparam^2) -\sum_{\iparam=1}^d \displaystyle{ \frac{1}{2\, \omega_\iparam^2} }( h_\iparam(\psi_{i,\iparam}) - h_\iparam(\psi_{ {\rm pop},\iparam}) )^2 \\ \partial \log (\ppsii(\psi_i;\theta))/\partial \psi_{ {\rm pop},\iparam} &=& \displaystyle{\frac{1}{\omega_\iparam^2} }h_\iparam^{\prime}(\psi_{ {\rm pop},\iparam})( h_\iparam(\psi_{i,\iparam}) - h_\iparam(\psi_{ {\rm pop},\iparam}) ) \\ \partial \log (\ppsii(\psi_i;\theta))/\partial \omega^2_{\iparam} &=& -\displaystyle{ \frac{1}{2\omega_\iparam^2} } +\displaystyle{\frac{1}{2\, \omega_\iparam^4} }( h_\iparam(\psi_{i,\iparam}) - h_\iparam(\psi_{ {\rm pop},\iparam}) )^2 \\ \partial^2 \log (\ppsii(\psi_i;\theta))/\partial \psi_{ {\rm pop},\iparam} \partial \psi_{ {\rm pop},\jparam} &=& \left\{ \begin{array}{ll} \left( h_\iparam^{\prime\prime}(\psi_{ {\rm pop},\iparam})( h_\iparam(\psi_{i,\iparam}) - h_\iparam(\psi_{ {\rm pop},\iparam}) )- h_\iparam^{\prime \, 2}(\psi_{ {\rm pop},\iparam}) \right)/\omega_\iparam^2 & {\rm if \quad } \iparam=\jparam \\ 0 & {\rm otherwise} \end{array} \right. \\ \partial^2 \log (\ppsii(\psi_i;\theta))/\partial \omega^2_{\iparam} \partial \omega^2_{\jparam} &=& \left\{ \begin{array}{ll} 1/(2\omega_\iparam^4) - ( h_\iparam(\psi_{i,\iparam}) - h_\iparam(\psi_{ {\rm pop},\iparam}) )^2/\omega_\iparam^6 & {\rm if \quad} \iparam=\jparam \\ 0 & {\rm otherwise} \end{array} \right. \\ \partial^2 \log (\ppsii(\psi_i;\theta))/\partial \psi_{ {\rm pop},\iparam} \partial \omega^2_{\jparam} &=& \left\{ \begin{array}{ll} -h_\iparam^{\prime}(\psi_{ {\rm pop},\iparam})( h_\iparam(\psi_{i,\iparam}) - h_\iparam(\psi_{ {\rm pop},\iparam}) )/\omega_\iparam^4 & {\rm if \quad} \iparam=\jparam \\ 0 & {\rm otherwise.} \end{array} \right. \end{eqnarray}\)
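
As a sketch, the following Python function evaluates the first-order derivatives above for one individual, assuming the transformations $h_\iparam$ and their derivatives are supplied componentwise (a log-normal parametrization, $h_\iparam = \log$, is used as the default for illustration):

    import numpy as np

    def dlog_ppsi(psi_i, psi_pop, omega2, h=np.log, h_prime=lambda x: 1.0 / x):
        """First derivatives of log p(psi_i; theta) with respect to psi_pop and omega^2,
        with h applied componentwise and Omega = diag(omega2) (sketch)."""
        psi_i, psi_pop, omega2 = map(np.asarray, (psi_i, psi_pop, omega2))
        resid = h(psi_i) - h(psi_pop)
        # d log p / d psi_pop,l = h_l'(psi_pop,l) (h_l(psi_i,l) - h_l(psi_pop,l)) / omega_l^2
        d_psi_pop = h_prime(psi_pop) * resid / omega2
        # d log p / d omega_l^2 = -1/(2 omega_l^2) + (h_l(psi_i,l) - h_l(psi_pop,l))^2 / (2 omega_l^4)
        d_omega2 = -0.5 / omega2 + 0.5 * resid**2 / omega2**2
        return d_psi_pop, d_omega2

The second-order derivatives given above can be coded in the same way.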



Example 2


We consider the same model for continuous data, assuming a constant error model and that the variance $a^2$ of the residual error has no variability:

\(\begin{eqnarray} y_{ij} | \psi_i &\sim& {\cal N}(f(t_{ij}, \psi_i) \ , \ a^2), \ \ 1 \leq j \leq n_i \\ h(\psi_i) &\sim_{i.i.d}& {\cal N}( h(\psi_{\rm pop}) , \Omega). \end{eqnarray}\)

Here, $\theta_y=a^2$, $\theta_\psi=(\psi_{\rm pop},\Omega)$ and

\( \log(\pyipsii(y_i,\psi_i;\theta)) = \log(\pcyipsii(y_i | \psi_i ; a^2)) + \log(\ppsii(\psi_i;\psi_{\rm pop},\Omega)), \)

where

\( \log(\pcyipsii(y_i | \psi_i ; a^2)) =-\displaystyle{\frac{n_i}{2} }\log(2\pi)- \displaystyle{\frac{n_i}{2} }\log(a^2) - \displaystyle{\frac{1}{2a^2} }\sum_{j=1}^{n_i}(y_{ij} - f(t_{ij}, \psi_i))^2 . \)

Derivatives of $\log(\pcyipsii(y_i | \psi_i ; a^2))$ with respect to $a^2$ are straightforward to compute. Derivatives of $\log(\ppsii(\psi_i;\psi_{\rm pop},\Omega))$ with respect to $\psi_{\rm pop}$ and $\Omega$ remain unchanged.
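
For instance, differentiating the expression above with respect to $a^2$ gives

\(\begin{eqnarray} \partial \log(\pcyipsii(y_i | \psi_i ; a^2))/\partial a^2 &=& -\displaystyle{\frac{n_i}{2a^2} } + \displaystyle{\frac{1}{2a^4} }\sum_{j=1}^{n_i}(y_{ij} - f(t_{ij}, \psi_i))^2 \\ \partial^2 \log(\pcyipsii(y_i | \psi_i ; a^2))/\partial (a^2)^2 &=& \displaystyle{\frac{n_i}{2a^4} } - \displaystyle{\frac{1}{a^6} }\sum_{j=1}^{n_i}(y_{ij} - f(t_{ij}, \psi_i))^2 . \end{eqnarray}\)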



Example 3


Consider again the same model for continuous data, assuming now that a subset $\xi$ of the parameters of the structural model has no variability:

\(\begin{eqnarray} y_{ij} | \psi_i &\sim& {\cal N}(f(t_{ij}, \psi_i,\xi) \ , \ a^2), \ \ 1 \leq j \leq n_i \\ h(\psi_i) &\sim_{i.i.d}& {\cal N}( h(\psi_{\rm pop}) , \Omega). \end{eqnarray}\)

As before, $\psi$ denotes the subset of individual parameters with variability. Here, $\theta_y=(\xi,a^2)$, $\theta_\psi=(\psi_{\rm pop},\Omega)$, and

\( \log(\pcyipsii(y_i | \psi_i ; \xi,a^2)) =-\displaystyle{\frac{n_i}{2} }\log(2\pi)- \displaystyle{\frac{n_i}{2} }\log(a^2) - \displaystyle{\frac{1}{2 a^2} }\sum_{j=1}^{n_i}(y_{ij} - f(t_{ij}, \psi_i,\xi))^2 . \)

Derivatives of $\log(\pcyipsii(y_i | \psi_i ; \xi, a^2))$ with respect to $\xi$ require the derivatives of $f$ with respect to $\xi$, which are usually not available in closed form. One possibility is to approximate them numerically using finite differences, as sketched below.
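
A minimal sketch of such an approximation, assuming the structural model is implemented as a function f(t, psi_i, xi) returning the vector of predictions at the observation times (hypothetical signature):

    import numpy as np

    def df_dxi(f, t, psi_i, xi, nu=1e-4):
        """Central-difference approximation of the derivatives of f with respect
        to the non-varying parameters xi (sketch)."""
        xi = np.asarray(xi, dtype=float)
        columns = []
        for j in range(xi.size):
            e = np.zeros_like(xi)
            e[j] = nu
            columns.append((f(t, psi_i, xi + e) - f(t, psi_i, xi - e)) / (2 * nu))
        return np.column_stack(columns)   # one column per component of xi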



Estimation using linearization of the model

Consider here a model for continuous data that uses a $\phi$-parametrization for the individual parameters:

\(\begin{eqnarray} y_{ij} &= & f(t_{ij} , \phi_i) + g(t_{ij} , \phi_i)\teps_{ij} \\ \phi_i &=& \phi_{\rm pop} + \eta_i . \end{eqnarray}\)

Let $\hphi_i$ be some predicted value of $\phi_i$, such as for instance the estimated mean or estimated mode of the conditional distribution $\pmacro(\phi_i |y_i ; \hat{\theta})$.

We can then choose to linearize the model for the observations $(y_{ij}, 1\leq j \leq n_i)$ of individual $i$ around the vector of predicted individual parameters. Let $\Dphi{f(t , \phi)}$ be the row vector of derivatives of $f(t , \phi)$ with respect to $\phi$. Then,

\(\begin{eqnarray} y_{ij} &\simeq& f(t_{ij} , \hphi_i) + \Dphi{f(t_{ij} , \hphi_i)} \, (\phi_i - \hphi_i) + g(t_{ij} , \hphi_i)\teps_{ij} \\ &\simeq& f(t_{ij} , \hphi_i) + \Dphi{f(t_{ij} , \hphi_i)} \, (\phi_{\rm pop} - \hphi_i) + \Dphi{f(t_{ij} , \hphi_i)} \, \eta_i + g(t_{ij} , \hphi_i)\teps_{ij} . \end{eqnarray}\)

Then, we can approximate the marginal distribution of the vector $y_i$ as a normal distribution:

\( y_{i} \approx {\cal N}\left(f(t_{i} , \hphi_i) + \Dphi{f(t_{i} , \hphi_i)} \, (\phi_{\rm pop} - \hphi_i) , \Dphi{f(t_{i} , \hphi_i)} \Omega \Dphi{f(t_{i} , \hphi_i)}^{\transpose} + g(t_{i} , \hphi_i)\Sigma_{n_i} g(t_{i} , \hphi_i)^{\transpose} \right), \)
(5)

where $\Sigma_{n_i}$ is the variance-covariance matrix of $\teps_{i,1},\ldots,\teps_{i,n_i}$. If the $\teps_{ij}$ are i.i.d., then $\Sigma_{n_i}$ is the identity matrix.

We can equivalently use the original $\psi$-parametrization and the fact that $\phi_i=h(\psi_i)$. Then,

\( \Dphi{f(t_{i} , \hphi_i)} = \Dpsi{f(t_{i} , \hpsi_i)} J_h(\hpsi_i)^{\transpose} , \)

where $J_h$ is the Jacobian of $h$.

We can then approximate the observed log-likelihood ${\llike}(\theta) = \log(\like(\theta;\by))=\sum_{i=1}^N \log(\pyi(y_i;\theta))$ using this normal approximation, and derive the F.I.M. by computing the matrix of second-order partial derivatives of ${\llike}(\theta)$.
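
For instance, here is a minimal sketch of the contribution of individual $i$ to ${\llike}(\theta)$ under the approximation (5), assuming a constant error model ($g \equiv a$, $\Sigma_{n_i}$ the identity matrix) and that the vector $f(t_{i},\hphi_i)$ and the matrix $\Dphi{f(t_{i},\hphi_i)}$ have been precomputed:

    import numpy as np

    def loglik_individual_lin(y_i, f_i, dphi_f_i, phi_hat_i, phi_pop, Omega, a2):
        """Gaussian log-density of y_i under the linearized model (5),
        with a constant error model (sketch)."""
        mean = f_i + dphi_f_i @ (phi_pop - phi_hat_i)                # approximate marginal mean
        cov = dphi_f_i @ Omega @ dphi_f_i.T + a2 * np.eye(len(y_i))  # approximate marginal covariance
        resid = y_i - mean
        _, logdet = np.linalg.slogdet(cov)
        return -0.5 * (len(y_i) * np.log(2 * np.pi) + logdet
                       + resid @ np.linalg.solve(cov, resid))

Summing these contributions over $i=1,\ldots,N$ gives the approximate value of ${\llike}(\theta)$.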

Except for very simple models, computing these second-order partial derivatives in closed form is not straightforward. In such cases, finite differences can be used to approximate them numerically. We can use for instance a central difference approximation of the second derivatives of $\llike(\theta)$. To this end, let $\nu>0$ and let $m$ be the dimension of $\theta$. For $j=1,2,\ldots, m$, let $\nu^{(j)}=(\nu^{(j)}_{k}, 1\leq k \leq m)$ be the $m$-vector such that

\( \nu^{(j)}_{k} = \left\{ \begin{array}{ll} \nu & {\rm if \quad j= k} \\ 0 & {\rm otherwise.} \end{array} \right. \)

Then, for $\nu$ small enough,

\(\begin{eqnarray} \partial_{\theta_j}{ {\llike}(\theta)} &\approx& \displaystyle{ \frac{ {\llike}(\theta+\nu^{(j)})- {\llike}(\theta-\nu^{(j)})}{2\nu} } \\ \end{eqnarray}\)

\(\begin{eqnarray} \partial^2_{\theta_j,\theta_k}{ {\llike}(\theta)} &\approx& \displaystyle{\frac{ {\llike}(\theta+\nu^{(j)}+\nu^{(k)})- {\llike}(\theta+\nu^{(j)}-\nu^{(k)}) -{\llike}(\theta-\nu^{(j)}+\nu^{(k)})+{\llike}(\theta-\nu^{(j)}-\nu^{(k)})}{4\nu^2} } . \end{eqnarray}\)
(6)
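
A sketch of this central-difference approximation, assuming a function ll(theta) that returns the approximate value of ${\llike}(\theta)$ (for instance computed as above):

    import numpy as np

    def hessian_central_diff(ll, theta, nu=1e-4):
        """Central-difference approximation (6) of the Hessian of the approximate
        observed log-likelihood; by (1), the F.I.M. is estimated by minus this matrix (sketch)."""
        theta = np.asarray(theta, dtype=float)
        m = theta.size
        H = np.zeros((m, m))
        for j in range(m):
            e_j = np.zeros(m)
            e_j[j] = nu
            for k in range(m):
                e_k = np.zeros(m)
                e_k[k] = nu
                H[j, k] = (ll(theta + e_j + e_k) - ll(theta + e_j - e_k)
                           - ll(theta - e_j + e_k) + ll(theta - e_j - e_k)) / (4 * nu**2)
        return H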






In summary, for a given estimate $\hat{\theta}$ of the population parameter $\theta$, the algorithm for approximating the Fisher Information Matrix $I(\hat{\theta})$ using a linear approximation of the model consists of:

1. For $i=1,2,\ldots,N$, obtain some estimate $(\hpsi_i)$ of the individual parameters $(\psi_i)$ (for example, we can average the last values of the sequence $(\psi_i^{(k)})$ drawn during the final iterations of the SAEM algorithm).

2. For $i=1,2,\ldots,N$, compute $\hphi_i=h(\hpsi_i)$, the mean and the variance of the normal distribution defined in (5), and ${\llike}(\theta)$ using this normal approximation.

3. Use (6) to approximate the matrix of second-order derivatives of ${\llike}(\theta)$.

