% LaTeX source for Answers to Bayesian Statistics: An Introduction (4th edn)
\documentclass[oneside]{book} \usepackage{times} \usepackage{amsmath} \usepackage{amssymb} \usepackage{amsbsy} \usepackage{makeidx} \usepackage{epsf} \usepackage{textcomp} \usepackage{verbatim} \usepackage{euscript} \setcounter{secnumdepth}{1} \setcounter{tocdepth}{1} % Set up environment for exercises at ends of chapters \newcounter{qno} \newcommand{\startqs}{\setcounter{qno}{0}\vspace{-1.5\baselineskip}} \newcommand{\nextq} {\vspace{1.5\baselineskip}\noindent\addtocounter{qno}{1}\arabic{qno}.\quad} % Allow for blank lines \newcommand{\blankline}{\vspace{\baselineskip}\noindent} % Define digitwidth and dotwidth (TeXbook p. 241) \newdimen\digitwidth \setbox0=\hbox{\rm0} \digitwidth=\wd0 \newdimen\dotwidth \setbox0=\hbox{\rm.} \dotwidth=\wd0 % Notation for vectors, matrices, estimates, random variables and sample means \newcommand{\vect}{\boldsymbol} \newcommand{\matr}{\boldsymbol} \newcommand{\est}{\widehat} \newcommand{\random}{\widetilde} \newcommand{\mean}{\overline} % Notation for dots in subscripts \newcommand {\bdot}{\hbox{\Huge .}} \newcommand {\dotdot}{{\hbox{\Huge .}\kern-0.1667em\hbox{\Huge .}}} \newcommand {\onedot}{1\kern-0.1667em\bdot} \newcommand {\twodot}{2\kern-0.1667em\bdot} \newcommand {\idot}{i\kern-0.1667em\bdot} \newcommand {\jdot}{j\kern-0.1667em\bdot} \newcommand {\mdot}{m\kern-0.1667em\bdot} \newcommand {\dotj}{\kern-0.1667em\bdot\kern-0.1667em j} % Define sech, arc sin and arc cos \newcommand{\sech}{\operatorname{sech}} \renewcommand{\arcsin}{\operatorname{arc\,sin}} \renewcommand{\arccos}{\operatorname{arc\,cos}} % Define Probability, Expectation, Variance, Covariance, Median, Mode \renewcommand{\Pr}{\mbox{$\mathsf P$}} \newcommand{\E}{\mbox{$\mathsf E$}} \newcommand{\Var}{\mbox{$\mathcal V$}} \newcommand{\Cov}{\mbox{$\mathcal C$}} \newcommand{\median}{\mbox{median\,}} \newcommand{\mode}{\mbox{mode\,}} % Define notation for evidence \newcommand{\Ev}{\mbox{Ev}} % Define small common fractions for use in display formulae \newcommand{\half}{\mbox{$\frac{1}{2}$}} \newcommand{\smallhalf}{\mbox{\small$\frac{1}{2}$}} \newcommand{\quarter}{\mbox{$\frac{1}{4}$}} \newcommand{\third}{\mbox{$\frac{1}{3}$}} \newcommand{\twothirds}{\mbox{$\frac{2}{3}$}} \newcommand{\ninth}{\mbox{$\frac{1}{9}$}} \newcommand{\twofifths}{\mbox{$\frac{2}{5}$}} % Type for setting pseudo-code \newcommand{\prog}{\texttt} % Notation for the R project \newcommand{\R}{\textsf{R}} % Script I for a (Kullback-Leibler) information measure \newcommand{\I}{\mbox{$\EuScript I$}} \newcommand{\ABCREJ}{\mbox{\textit{ABC-REJ\/}}} % Alternative notation for fractions (TeXbook, exercise 11.6) \newcommand{\slopefrac}[2]{\leavevmode\kern.1em \raise .5ex\hbox{\the\scriptfont0 #1}\kern-.1em /\kern-.15em\lower .25ex\hbox{\the\scriptfont0 #2}} % Notation for beta funcion \newcommand{\Betafn}{\mbox{B}} % Define names of distributions \newcommand{\N}{\mbox{N}} % A.1 \newcommand{\G}{\mbox{G}} % A.4 \newcommand{\Ex}{\mbox{E}} % A.4 \renewcommand{\t}{\mbox{t}} % A.8 \newcommand{\Be}{\mbox{Be}} % A.10 \newcommand{\B}{\mbox{B}} % A.11 \renewcommand{\P}{\mbox{P}} % A.12 \newcommand{\NB}{\mbox{NB}} % A.13 \renewcommand{\H}{\mbox{H}} % A.14 \newcommand{\U}{\mbox{U}} % A.15 \newcommand{\UD}{\mbox{UD}} % A.15 \newcommand{\Pa}{\mbox{Pa}} % A.16 \newcommand{\Pabb}{\mbox{Pabb}} % A.16 \newcommand{\M}{\mbox{M}} % A.17 \newcommand{\BF}{\mbox{BF}} % A.18 \newcommand{\F}{\mbox{F}} % A.19 \newcommand{\z}{\mbox{z}} % A.20 \newcommand{\C}{\mbox{C}} % A.21 % Define some common bold symbols \newcommand{\bbeta}{\mbox{$\boldsymbol\beta$}} 
\newcommand{\beeta}{\mbox{$\boldsymbol\eta$}} \newcommand{\btheta}{\mbox{$\boldsymbol\theta$}} \newcommand{\bkappa}{\mbox{$\boldsymbol\kappa$}} \newcommand{\blambda}{\mbox{$\boldsymbol\lambda$}} \newcommand{\bmu}{\mbox{$\boldsymbol\mu$}} \newcommand{\btau}{\mbox{$\boldsymbol\tau$}} \newcommand{\bzero}{\mbox{$\boldsymbol0$}} % Further bold symbols for use in connection with hierachical models \newcommand {\bpiem}{\mbox{\boldmath $\pi^{EM}$}} \newcommand {\bhtheta}{\mbox{\boldmath $\est\theta$}} \newcommand {\bhthetao}{\mbox{\boldmath $\est\theta^{\mbox{\scriptsize\it0}}$}} \newcommand {\bhthetajs}{\mbox{\boldmath $\est\theta^{JS}$}} \newcommand {\bhthetajsplus}{\mbox{\boldmath $\est\theta^{JS^{{}_+}}$}} \newcommand {\bhthetaem}{\mbox{\boldmath $\est\theta^{EM}$}} \newcommand {\bhthetab}{\mbox{\boldmath $\est\theta^{B}$}} \newcommand {\bhthetaeb}{\mbox{\boldmath $\est\theta^{EB}$}} \newcommand {\thetabar}{\mbox{$\mean\theta$}} \newcommand {\bphi}{\mbox{\boldmath $\phi$}} \newcommand {\BPhi}{\mbox{\boldmath $\Phi$}} \newcommand {\bpsi}{\mbox{\boldmath $\psi$}} \newcommand {\BPsi}{\mbox{\boldmath $\Psi$}} \newcommand {\BSigma}{\mbox{\boldmath $\Sigma$}} % Define transpose for matrix theory \newcommand{\transpose}{\mbox{${}^{\text{T}}$}} % Define differentials with roman d and thin space before \renewcommand{\d}{\mbox{d}} \newcommand{\dF}{\,\mbox{\d$F$}} \newcommand{\dt}{\,\mbox{\d$t$}} \newcommand{\du}{\,\mbox{\d$u$}} \newcommand{\dU}{\,\mbox{\d$U$}} \newcommand{\dx}{\,\mbox{\d$x$}} \newcommand{\dy}{\,\mbox{\d$y$}} \newcommand{\dz}{\,\mbox{\d$z$}} \newcommand{\dgamma}{\,\mbox{\d$\gamma$}} \newcommand{\dzeta}{\,\mbox{\d$\zeta$}} \newcommand{\deta}{\,\mbox{d$\eta$}} \newcommand{\dtheta}{\,\mbox{\d$\theta$}} \newcommand{\dbtheta}{\,\mbox{\d$\boldsymbol\theta$}} \newcommand{\dkappa}{\,\mbox{\d$\kappa$}} \newcommand{\dlambda}{\,\mbox{\d$\lambda$}} \newcommand{\dLambda}{\,\mbox{\d$\Lambda$}} \newcommand{\dmu}{\,\mbox{\d$\mu$}} \newcommand{\dbmu}{\,\mbox{\d$\bmu$}} \newcommand{\drho}{\,\mbox{\d$\rho$}} \newcommand{\dpi}{\,\mbox{\d$\pi$}} \newcommand{\dxi}{\,\mbox{\d$\xi$}} \newcommand{\dphi}{\,\mbox{\d$\phi$}} \newcommand{\dpsi}{\,\mbox{\d$\psi$}} \newcommand{\domega}{\,\mbox{\d$\omega$}} % Hyp for hypothesis \newcommand{\Hyp}{\mbox{H}} % Blackboard bold Z for the integers \newcommand{\Z}{\mbox{$\mathbb Z$}} % Script X for a set of possible observations \newcommand{\X}{\mbox{$\mathcal X$}} % EM, GEM, E-step and M-step for the EM algorithm \newcommand{\EM}{\mbox{\textit{EM}\ }} \newcommand{\GEM}{\mbox{\textit{GEM}\ }} \newcommand{\Estep}{\mbox{\textit{E}-step\ }} \newcommand{\Mstep}{\mbox{\textit{M}-step\ }} % Omit the word Chapter at the start of chapters \renewcommand{\chaptername}{} \begin{document} \thispagestyle{empty} \begin{flushleft} {\Huge{\textbf{Bayesian Statistics:}}}\\ \vspace{10 mm} {\Large{\textbf{An Introduction}}}\\ \vspace{20 mm} {\huge{\textbf{PETER M. 
LEE}}}\\ \vspace{3 mm} {\large{\textit{\textbf{\mbox{\noindent Formerly Provost of Wentworth College,}}}}}\\ {\large{\textit{\textbf{\mbox{\noindent University of York, England}}}}}\\ \vspace{15mm} {\Large{\textit{\textbf{Third Edition}}}}\\ \vspace{55mm} A member of the Hodder Headline Group \\ LONDON $\bullet$ SYDNEY $\bullet$ AUCKLAND \\ \vspace{3 mm} Copublished in North, Central and South America by \\ John Wiley \& Sons Inc., New York $\bullet$ Toronto \end{flushleft} \vfill \mainmatter \appendix \pagestyle{headings} \setcounter{chapter}{3} \allowdisplaybreaks \chapter{Answers to Exercises} \pagestyle{plain} \section {Exercises on Chapter \arabic{section}} \startqs \nextq Considering trumps and non-trumps separately, required probability is \[ 2\binom{3}{3}\binom{23}{10}\left/\binom{26}{13}\right.=\frac{11}{50}. \] Probability of a $2:1$ split is 39/50, and the conditional probability that the king is the odd one is 1/3, so probability one player has the king and the other the remaining two is 13/50. \nextq \!\!(a) If $\Pr(A)=0$ or $\Pr(A)=1$. \blankline (b) Take $A$ as ``odd red die'', $B$ as ``odd blue die'' and $C$ as ``odd sum''. \blankline (c) Take $C$ as $\{HHH,\, THH,\, THT,\, TTH\}$. \nextq \!\!(a)\quad$\Pr(\text{homozygous})=1/3;\quad \Pr(\text{heterozygous})=2/3.$ \blankline (b)\quad By Bayes' Theorem \[ \Pr(BB\,|\,\text{7 black}) = \frac{(1/3)(1)}{(1/3)(1)+(2/3)(1/2^7)} = \frac{64}{65}. \] \blankline (c)\quad Similarly we see by induction (wih case $n=0$ given by part (a)) that $\Pr(BB\,|\,\text{first $n$ black})=2^{n-1}/(2^{n-1}+1)$ since \begin{align*} \Pr(BB\,|\,\text{first $n+1$ black}) & = \frac{\{2^{n-1}/(2^{n-1}+1)\}(1)} {\{2^{n-1}/(2^{n-1}+1)\}(1)+\{1/(2^{n-1}+1)\}(1/2)} \\ & = \frac{2^n}{2^n+1}. \end{align*} \nextq Use $\Pr(GG)=\Pr(M)\Pr(GG|M)+\Pr(D)\Pr(GG|D)$ to get \[ \Pr(M)=\frac{\Pr(GG)-p^2}{p(1-p)} \] \nextq $p(3)=p(4)=1/36$, $p(5)=p(6)=2/36$, $p(7)=\dots=p(14)=3/36$, $p(15)=p(16)=2/36$, $p(17)=p(18)=1/36$. As for the distribution function, \[ F(x)=\left\{\begin{array}{ll} 0 & \text{for $x < 3$;} \\ 1/36 & \text{for $3\leqslant x < 4$;} \\ 2/36 & \text{for $4\leqslant x < 5$;} \\ 4/36 & \text{for $5\leqslant x < 6$;} \\ 6/36 & \text{for $6\leqslant x < 7$;} \\ (3[x]-12)/36 & \text{for $n\leqslant x < n+1$ where $7\leqslant n < 15$;} \\ 32/36 & \text{for $15\leqslant x < 16$;} \\ 34/36 & \text{for $16\leqslant x < 17$;} \\ 35/36 & \text{for $17\leqslant x < 18$;} \\ 1 & \text{for $x\geqslant18$} \end{array}\right. \] where $[x]$ denotes the integer part of $x$. \nextq $\Pr(k=0)=(1-\pi)^n=(1-\lambda/n)^n\to\text{e}^{-\lambda}$. More generally \begin{align*} p(k) & = \binom{n}{k}\pi^k(1-\pi)^{n-k} \\ & = \frac{\lambda^k}{k!}\left(1-\frac{\lambda}{n}\right)^n \frac{n}{n}\frac{n-1}{n}\dots\frac{n-k+1}{n} \left(1-\frac{\lambda}{n}\right)^{-k} \\ & \to\frac{\lambda^k}{k!}\exp(-\lambda). \end{align*} \nextq \!\!(a) $\Pr\bigl(\,\random k=0\bigr)=\Pr(\random m=\random n=0)= \Pr(\random m=0)\,\Pr(\random n=0)= \text{e}^{-\lambda}\text{e}^{-\mu}=\text{e}^{-(\lambda+\mu)}$, \begin{align*} \Pr(\,\random k=1\bigr) & = \Pr\bigl(\{\random m=1,\,\random n=0\}\ \text{or}\ \{\random m=0,\,\random n=1\}\bigr) \\ & = \lambda\text{e}^{-(\lambda+\mu)}+ \mu\text{e}^{-(\lambda+\mu)}= (\lambda+\mu)\text{e}^{-(\lambda+\mu)}. 
\end{align*}
\blankline
(b) More generally
\begin{align*}
 \Pr\bigl(\,\random k=k\bigr)
  & = \sum_{m=0}^k \Pr(\random m=m,\,\random n=k-m)=
      \sum_{m=0}^k \Pr(\random m=m)\,\Pr(\random n=k-m) \\
  & = \sum_{m=0}^k \frac{\lambda^m}{m!}\exp(-\lambda)
      \frac{\mu^{k-m}}{(k-m)!}\exp(-\mu) \\
  & = \frac{1}{k!}\exp(-(\lambda+\mu)) \sum_{m=0}^{k}
      \binom{k}{m}\lambda^m\mu^{k-m}=
      \frac{(\lambda+\mu)^k}{k!}\exp(-(\lambda+\mu)).
\end{align*}
\blankline
(c) By definition of conditional probability
\begin{align*}
 \Pr\bigl(\random m=m\,|\,\random k=k\bigr)
  & = \frac{\Pr\bigl(\random m=m,\,\random k=k\bigr)}
           {\Pr\bigl(\random k=k\bigr)}=
      \frac{\Pr\bigl(\random m=m,\phantom{\random k}\random n=k-m\bigr)}
           {\Pr\bigl(\,\random k=k\bigr)} \\
  & = \frac{\frac{\lambda^m}{m!}\exp(-\lambda)\,
            \frac{\mu^{k-m}}{(k-m)!}\exp(-\mu)}
           {\frac{(\lambda+\mu)^k}{k!}\exp(-(\lambda+\mu))} \\
  & = \binom{k}{m}\left(\frac{\lambda}{\lambda+\mu}\right)^m
      \left(1-\frac{\lambda}{\lambda+\mu}\right)^{k-m}.
\end{align*}
\nextq Let $y=x^2$ where $x\sim\N(0,1)$. Then
\begin{align*}
 \Pr\left(\random y\leqslant y\right)
  & = \Pr\left(\random x^2\leqslant y\right)=
      \Pr\left(-\sqrt{y}\leqslant\random x\leqslant\sqrt{y}\,\right) \\
  & = \Pr\left(\random x\leqslant\sqrt{y}\,)-\Pr(\random x < -\sqrt{y}\,\right)
\end{align*}
so that (because $\Pr(\random x=-\sqrt{y})=0$)
\[ F_{\random y}(y) = F_{\random x}\left(\sqrt{y}\,\right)
                    - F_{\random x}\left(-\sqrt{y}\,\right) \]
and on differentiation
\[ p_{\random y}(y)=\half y^{-\frac{1}{2}}p_{\random x}\left(\sqrt{y}\,\right)+
   \half y^{-\frac{1}{2}}p_{\random x}\left(-\sqrt{y}\,\right). \]
Alternatively, you could argue that
\[ \Pr(y < \random y \leqslant y + \dy)
   = \Pr(x < \random x \leqslant x + \dx)
   + \Pr(-x -\dx \leqslant \random x < -x) \]
implying
\[ p_{\random y}(y)\dy = p_{\random x}(x)\dx + p_{\random x}(-x)\dx \]
which as $\dy/\dx = 2x = 2\sqrt{y}$ leads to the same conclusion.
In the case where $x\sim\N(0,1)$ this gives
\[ p_{\random y}(y)=(2\pi y)^{-\frac{1}{2}}\exp\left(-\half y\right) \]
which is the density of $\chi_1^2$.
\nextq By independence, since $\random M \leqslant M$ iff every one of the
individual $X_i$ is less than or equal to $M$
\[ \begin{array}{ll}
   F_M(M)=\Pr(X_i\leqslant M \quad\forall i) = (F(M))^n;
   & F_m(m)=1-(1-F(m))^n; \\
   p_M(M)=nf(M)(F(M))^{n-1};
   & p_m(m)=nf(m)(1-F(m))^{n-1}
   \end{array} \]
\nextq Let $m$ be the minimum of $u$ and $v$ and $c$ be the length of the
centre section. Then $p(m, c)
=p_{\random u,\,\random v}(m,c)+p_{\random u,\,\random v}(c,m)
=2$ if $m+c\leqslant1$ and 0 otherwise.
\nextq If $F(x,y)=F(x)F(y)$ then by differentiation with respect to $x$ and
with respect to $y$ we have $p(x,y)=p(x)p(y)$. The converse follows by
integration. In discrete cases if $F(x,y)=F(x)F(y)$ then
$p(x,y)=F(x,y)-F(x-1,y)-F(x,y-1)+F(x-1,y-1)=
(F_{\random x}(x)-F_{\random x}(x-1))(F_{\random y}(y)-F_{\random y}(y-1))=
p(x)p(y)$. Conversely if $p(x,y)=p(x)p(y)$ then
$F(x,y)=\sum_{\xi\leqslant x}\sum_{\eta\leqslant y}
p_{\random x,\random y}(\xi,\eta)$ and so
$F(x,y)=\sum_{\xi\leqslant x}p_{\random x}(\xi)
\sum_{\eta\leqslant y}p_{\random y}(\eta)=F(x)F(y)$.
\nextq $\E X=n(1-\pi)/\pi;\ \ \Var X=n(1-\pi)/\pi^2.$
\nextq $\E X=\sum\E Z_i^2=\sum 1=\nu$, while
$\E X^2=\sum\E Z_i^4+\sum_{i\ne j}\E Z_i^2Z_j^2=3\nu+\nu(\nu-1)$ so that
$\Var X=\E X^2-(\E X)^2=2\nu$. Similarly by integration.
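\blankline
A quick Monte Carlo check of this last result is easy in \R; the sketch below
(in which the choices $\nu=5$ and $100\,000$ replications are arbitrary)
estimates the mean and variance of $X=\sum Z_i^2$:
\begin{verbatim}
# Monte Carlo check that X = Z_1^2 + ... + Z_nu^2 with Z_i ~ N(0,1)
# has mean nu and variance 2*nu; nu = 5 and N = 100000 are arbitrary
set.seed(1)
nu <- 5
N <- 100000
x <- colSums(matrix(rnorm(nu * N)^2, nrow = nu))
mean(x)    # close to nu = 5
var(x)     # close to 2*nu = 10
\end{verbatim}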
\nextq $\E X=n\pi$, $\E X(X-1)=n(n-1)\pi^2$ and
$\E X(X-1)(X-2)=n(n-1)(n-2)\pi^3$, so $\E X=n\pi$,
$\E X^2=\E X(X-1)+\E X=n(n-1)\pi^2+n\pi$ and
\begin{align*}
 \E(X-\E X)^3
  & = \E X^3-3(\E X^2)(\E X)+2(\E X)^3 \\
  & = \E X(X-1)(X-2)-3(\E X^2)(\E X)+3\E X^2+2(\E X)^3-2\E X \\
  & = n\pi(1-\pi)(1-2\pi),
\end{align*}
and thus $\gamma_1=(1-2\pi)/\sqrt{[n\pi(1-\pi)]}$.
\nextq $\phi\geqslant \int_{\{x;\,|x-\mu|\geqslant c\}}c^2p(x)\dx
=c^2\Pr(|x-\mu|\geqslant c)$.
\nextq By symmetry $\E x=\E y=\E xy=0$ so $\Cov(x,y)=\E xy-\E x\E y=0$.
However $0=\Pr(x=0,\,y=0)\ne\Pr(x=0)\,\Pr(y=0)=\half\times\half=\quarter$.
\nextq $p(y|x)=\{2\pi(1-\rho^2)\}^{-\frac{1}{2}}
\exp\{-\half(y-\rho x)^2/(1-\rho^2)\}$ so conditional on $\random x=x$ we have
$y\sim\N(\rho x,1-\rho^2)$, so $\E (y|x)=\rho x$.
$\E (y^2|x)=\Var (y|x)+\{\E(y|x)\}^2=1-\rho^2+\rho^2x^2$, hence
$\E (xy|x)=\rho x^2$, $\E(x^2y^2|x)=x^2-\rho^2x^2+\rho^2x^4$. Therefore
$\E xy=\rho$ and (as $\E x^4=3$) $\E x^2y^2=1+2\rho^2$, so
$\E xy-(\E x)(\E y)=\rho$ and $\E x^2y^2-(\E x^2)(\E y^2)=2\rho^2$. As
$\Var x=1$ and $\Var x^2=\E x^4-(\E x^2)^2=3-1=2=\Var y^2$, the result follows.
\nextq \!\!(a)\quad $p(x,y)=
(\lambda^x\text{e}^{-\lambda}/x!)\binom{x}{y}\pi^y(1-\pi)^{x-y}$ so adding over
$x=y, y+1, \dots$ and using
$\sum\lambda^{x-y}(1-\pi)^{x-y}/(x-y)!=\text{e}^{\lambda(1-\pi)}$ we get
$p(y)=(\lambda\pi)^y\text{e}^{-\lambda\pi}/y!$ so that
$\random y\sim\P(\lambda\pi)$. Now note that
$\E_{\random y|\random x}(\random y|\random x)=x\pi$ and this has expectation
$\lambda\pi$.
\blankline
(b)\quad Note that
$\Var_{\random y|\random x}(\random y|\random x)=x\pi(1-\pi)$ which has
expectation $\lambda\pi(1-\pi)$ and that
$\E_{\random y|\random x}(\random y|\random x)=x\pi$ which has variance
$\lambda\pi^2$ so that the right hand side adds to $\lambda\pi$, the variance
of $\random y$.
\nextq We note that
\[ I=\int_{0}^{\infty}\exp(-\half z^2)\,dz=
     \int_{0}^{\infty}\exp(-\half(xy)^2)\,y\,dx \]
for any $y$ (on setting $z=xy$). Putting $z$ in place of $y$, it follows that
for any $z$
\[ I = \int_{0}^{\infty}\exp(-\half(zx)^2)\,z\,dx \]
so that
\[ I^2=\left(\int_{0}^{\infty}\exp(-\half z^2)\,dz\right)
       \left(\int_{0}^{\infty}\exp(-\half(zx)^2)\,z\,dx\right)
      =\int_{0}^{\infty}\int_{0}^{\infty}
       \exp\{-\half(x^2+1)z^2\}\,z\,dz\,dx. \]
Now set $(1+x^2)z^2=2t$ so that $z\,dz=dt/(1+x^2)$ to get
\begin{align*}
 I^2&=\int_{0}^{\infty}\int_{0}^{\infty}
      \exp(-t)\,\frac{dt}{(1+x^2)}\,dx
     =\left(\int_{0}^{\infty}\exp(-t)\,dt\right)
      \left(\int_{0}^{\infty}\frac{dx}{(1+x^2)}\right) \\
    &=\left[-\exp(-t)\right]_0^{\infty}\,
      \left[\tan^{-1}x\right]_0^{\infty}
     =\bigl[1\bigr]\bigl[\half\pi\bigr] \\
    &=\frac{\pi}{2}
\end{align*}
and hence $I=\sqrt{\pi/2}$ so that the integral of $\phi$ from $-\infty$ to
$\infty$ is 1, and hence $\phi$ \textit{is} a probability density function.
This method is apparently due to Laplace (1812, Section 24, pages 94--95 in
the first edition).
\section {Exercises on Chapter \arabic{section}}
\startqs
\nextq $p(\pi)=\bigl(\Betafn(k+1,n-k+1)\bigr)^{-1}\pi^k(1-\pi)^{n-k}$ or
$\pi\sim\Be(k+1,n-k+1)$.
\nextq $\mean x=16.35525$, so assuming uniform prior, posterior is
$\N(16.35525,\,1/12)$. A 90\% HDR is $16.35525\pm1.6449/\sqrt{12}$ or
$16.35525\pm0.47484$, that is, the interval $(15.88041, 16.83009)$.
\nextq $x-\theta\sim\N(0,1)$ and $\theta\sim\N(16.35525,\,1/12)$, so
$x\sim\N(16.35525,\,13/12)$.
\nextq Assuming uniform prior, posterior is $\N(\mean x,\phi/n)$, so take
$n=k$.
If prior variance is $\phi_0$, posterior variance is
$\{1/\phi_0+n/\phi\}^{-1}$, so take $n$ the least integer such that
$n\geqslant(k-1)\phi/\phi_0$.
\nextq Posterior is
$k(2\pi/25)^{-\frac{1}{2}}\exp\{-\half(\theta-0.33)^2\times25\}$ for
$\theta > 0$ and 0 for $\theta < 0$. Integrating we find $1=k\Pr(X > 0)$ where
$X\sim\N(0.33,\,1/25)$, so $k=\{1-\Phi(-1.65)\}^{-1}=1.052$. We now seek
$\theta_1$ such that $\Pr(\theta\leqslant\theta_1|\vect x)=0.95$ or
equivalently $k\Pr(0 < X\leqslant\theta_1)=0.95$ with $k$ and $X$ as before.
This results in $0.95=1.052\{\Phi(5\theta_1-1.65)-\Phi(-1.65)\}$, so
$\Phi(5\theta_1-1.65)=0.95/1.052+0.0495=0.9525$ leading to
$5\theta_1-1.65=1.67$ and so to $\theta_1=0.664$. The required interval is
thus $[0, 0.664)$.
\nextq From the point of view of someone starting from prior ignorance, my
beliefs are equivalent to an observation of mean $\lambda$ and variance $\phi$
and yours to one of mean $\mu$ and variance $\psi$, so after taking into
account my beliefs such a person is able to use Section 2.2 and conclude
$\theta\sim\N(\lambda_1,\phi_1)$ where $1/\phi_1=1/\phi+1/\psi$ and
$\lambda_1/\phi_1=\lambda/\phi+\mu/\psi$.
\nextq The likelihood (after inserting a convenient constant) is
\[ l(\vect x|\theta)=(2\pi\phi/n)^{-\frac{1}{2}}
   \exp\{-\half (\mean x-\theta)^2/(\phi/n)\}. \]
Hence by Bayes' Theorem, within $I_{\alpha}$
\begin{gather*}
 Ac(1-\varepsilon)(2\pi\phi/n)^{-\frac{1}{2}}
 \exp\{-\half(\mean x-\theta)^2/(\phi/n)\}
 \leqslant p(\theta|\vect x) \\
 \leqslant Ac(1+\varepsilon)(2\pi\phi/n)^{-\frac{1}{2}}
 \exp\{-\half(\mean x-\theta)^2/(\phi/n)\}
\end{gather*}
and outside $I_{\alpha}$
\[ 0\leqslant p(\theta|\vect x)\leqslant AMc(2\pi\phi/n)^{-\frac{1}{2}}
   \exp\{-\half(\mean x-\theta)^2/(\phi/n)\}, \]
where $A$ is a constant equal to $p(\vect x)^{-1}$. Using the right hand
inequality for the region inside $I_{\alpha}$ we get
\begin{align*}
 \int_{I_{\alpha}} p(\theta|\vect x)\dtheta
  & \leqslant Ac(1+\varepsilon) \int_{I_{\alpha}} (2\pi\phi/n)^{-\frac{1}{2}}
    \exp\{-\half(\mean x-\theta)^2/(\phi/n)\}\dtheta \\
  & = Ac(1+\varepsilon)\int_{-\lambda_{\alpha}}^{\lambda_{\alpha}}
      (2\pi)^{-\frac{1}{2}}\exp(-\half t^2)\dt,\text{\ where\ }
      t=(\mean x-\theta)/\sqrt{\phi/n} \\
  & = Ac(1+\varepsilon)[\Phi(\lambda_{\alpha})-\Phi(-\lambda_{\alpha})]
     =Ac(1+\varepsilon)(1-\alpha)
\end{align*}
since $\Phi(-\lambda_{\alpha})=1-\Phi(\lambda_{\alpha})$. Similarly, the same
integral exceeds $Ac(1-\varepsilon)(1-\alpha)$, and, if $J_{\alpha}$ is the
outside of $I_{\alpha}$,
\[ 0\leqslant \int_{J_{\alpha}} p(\theta|\vect x)\dtheta \leqslant AMc\alpha. \]
Combining these results we have, since
$ \int_{I_{\alpha}\cup J_{\alpha}} p(\theta|\vect x)\dtheta=1$,
\[ Ac(1-\varepsilon)(1-\alpha)\leqslant1\leqslant
   Ac[(1+\varepsilon)(1-\alpha)+M\alpha], \]
and hence
\[ \frac{1}{(1+\varepsilon)(1-\alpha)+M\alpha}\leqslant Ac\leqslant
   \frac{1}{(1-\varepsilon)(1-\alpha)}. \]
The result now follows on remarking that the maximum value of the exponential
in $J_{\alpha}$ occurs at the end-points $\mean x\pm
\lambda_{\alpha}\sqrt{\phi/n}$, where it has the value
$\exp(-\half\lambda_{\alpha}^2)$.
\nextq The likelihood is
\begin{align*}
 l(\theta|\vect x)
  & \propto h(\theta)^n \exp\left\{\sum t(x_i)\psi(\theta)\right\} \\
  & \propto h(\theta)^n \exp\left\{\exp\left[\log\psi(\theta)-
    \log\left(1\left/\,\sum t(x_i)\right.\right)\right]\right\}.
\end{align*}
This is of the data-translated form $g(\psi(\theta)-t(\vect x))$ \textit{if}
the function $h(\theta)\equiv1$.
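\blankline
The numerical steps in the answer to Exercise 5 above are easily reproduced in
\R\ (using exact normal quantiles in place of tables), for instance:
\begin{verbatim}
# Exercise 5: posterior is N(0.33, 1/25) truncated to theta > 0
k <- 1 / pnorm(0, mean = 0.33, sd = 0.2, lower.tail = FALSE)
k                                             # about 1.052
# 95% point theta_1 of the truncated posterior
theta1 <- qnorm(0.95 / k + pnorm(0, mean = 0.33, sd = 0.2),
                mean = 0.33, sd = 0.2)
theta1                                        # about 0.664
\end{verbatim}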
\nextq Take prior $S_0\chi_{\nu}^{-2}$ where $\nu=8$ and $S_0/(\nu-2)=100$ so $S_0=600$ and take $n=30$ and $S=13.2^2(n-1)=5052.96$. Then (cf.\ Section 2.7) posterior is $(S_0+S)\chi_{\nu+n}^{-2}$ where $\nu+n=38$ and $S_0+S=5652.96$. Values of $\chi^2$ corresponding to a 90\% HDR for log $\chi_{38}^2$ are (interpolating between 35 and 40 degrees of freedom) 25.365 and 54.269 so 90\% interval is $(104,223)$. \nextq $n=10$, $\mean x=5.032$, $S=3.05996$, $s/\sqrt{n}=0.1844$. Posterior for mean $\theta$ is such that $(\theta-5.032)/0.1844\sim\t_{\nu}$, so as $\t_{9,0.95}=1.833$ a 90\% HDR is $(4.69,5.37)$. Posterior for variance $\phi$ is $S\chi_{\nu}^{-2}$, so as values of $\chi^2$ corresponding to a 90\% HDR for $\log\chi_9^2$ are 3.628 and 18.087 required interval is $(0.17,0.84)$. \nextq Sufficient statistic is $\sum_{i=1}^n x_i$, or equivalently $\mean x$. \nextq Sufficient statistic is $\left(\sum_{i=1}^n x_i, \prod_{i=1}^n x_i\right)$ or equivalently $(\mean x,\check x)$ where $\check x$ is the geometric mean. \nextq $p(\beta)\propto\beta^{-\alpha_0}\exp(-\xi/\beta)$. \nextq If $\theta\sim\U(-\pi/2,\pi/2)$ and $x=\tan\theta$, then by the usual change-of-variable rule $p(x)=\pi^{-1}|\!\dtheta/\dx|=\pi^{-1}\left(1+x^2\right)^{-1}$, and similarly if $\theta\sim\U(\pi/2,3\pi/2)$. The result follows. \nextq Straightforwardly \begin{align*} p(\vect x|\vect \pi) & = \frac{n!}{x!\,y!\,z!}\pi^x\rho^y\sigma^z \\ & = \left(\frac{n!}{x!\,y!\,(n-x-y)!}\right) \times\exp\{n\log(1-\pi-\rho)\} \\ & \qquad\times\exp[x\log\{\pi/(1-\pi-\rho\}+y\log\{\rho/(1-\pi-\rho)\}] \\ & = g(x,y)\times h(\pi,\rho)\times \exp[t(x,y)\psi(\pi,\rho)+u(x,y)\chi(\pi,\rho)] \end{align*} \nextq $\nu_1=\nu_0+n=104$; $n_1=n_0+n=101$; $\theta_1=(n_0\theta_0+n\mean x)/n_1=88.96$; $S=(n-1)s^2=2970$; $S_1=S_0+S+(n_0^{-1}+n^{-1})^{-1}(\theta_0-\mean x)^2=3336$; $s_1/\sqrt{n_1}=\sqrt{3336/(101\times104)}=0.56$. It follows that \textit{a posteriori} \[ \frac{\mu-\theta_1}{s_1/\sqrt{n_1}}\sim\t_{\nu_1},\qquad\ \phi\sim S_1\chi_{\nu_1}^{-2}. \] For prior, find 75\% point of $\t_4$ from, e.g., Neave's Table 3.1 as 0.741. For posterior, as degrees of freedom are large, can approximate $\t$ by normal, noting that the 75\% point of the standard normal distribution is 0.6745. Hence a 50\% prior HDR for $\mu$ is $85\pm0.741\sqrt{(350/4\times1)}$, that is $(75.6,94.4)$, while a 50\% posterior HDR for $\mu$ is $88.96\pm0.6745\times0.56$, that is, $(88.58, 89.34)$. \nextq With $p_1(\theta)$ being $\N(0,1)$ we see that $p_1(\theta|x)$ is $\N(2,2)$, and with $p_2(\theta)$ being $\N(1,1)$ we see that $p_2(\theta|x)$ is $\N(3,1)$. 
As \begin{align*} \int p(x|\theta)p_1(\theta)\dtheta & = \int \frac{1}{2\pi}\exp\{-\half(x-\theta)^2-\half\theta^2\}\dtheta \\ & = \frac{\sqrt{\half}}{\sqrt{2\pi}}\exp\{-\quarter x^2\} \\ & \quad\times\int\frac{1}{\sqrt{2\pi\half}} \exp\{-\half(\theta-\half x)^2/\half\}\dtheta \\ & = \frac{\sqrt{\half}}{\sqrt{2\pi}}\exp\{-\quarter x^2\} =\frac{1}{2\sqrt{\pi}}\exp\{-2\} \end{align*} and similarly \begin{align*} \int p(x|\theta)p_1(\theta)\dtheta & =\int \frac{1}{2\pi}\exp\{-\half(x-\theta)^2-\half(\theta-1)^2\}\dtheta \\ & =\frac{\sqrt{\half}}{\sqrt{2\pi}}\exp\{-\quarter (x-1)^2\} \\ & \quad\times\int\frac{1}{\sqrt{2\pi\half}} \exp\{-\half(\theta-\half (x+1))^2/\half\}\dtheta \\ & =\frac{\sqrt{\half}}{\sqrt{2\pi}}\exp\{-\quarter (x-1)^2\} =\frac{1}{2\sqrt{\pi}}\exp\{-\quarter\} \end{align*} so that just as in Section 2.13 we see that the posterior is an $\alpha'$ to $\beta'$ mixture of $\N(2,1)$ and $\N(3,1)$ where $\alpha'\propto\twothirds\text{e}^{-2}=0.09$ and $\beta'\propto\third\text{e}^{-1/4}=0.26$, so that $\alpha'=0.26$ and $\beta'=0.74$. It follows that \[ \Pr(\theta > 1)=0.26\times0.1587+0.74\times0.0228=0.058. \] \nextq Elementary manipulation gives \begin{align*} & n\mean X^2+n_0\theta_0^2 - (n+n_0)\left(\frac{n\mean X+n_0\theta_0}{n+n_0}\right)^2 \\ & = \frac{1}{n+n_0}[\{n(n+n_0)-n^2\}\mean X^2+ \{n_0(n+n_0)-n_o^2\}\theta_0^2-2(nn_0)\mean X\theta] \\ & =\frac{nn_0}{n+n_0}[\mean X^2+\theta_0^2-2\mean X\theta_0] = (n_0^{-1}+n^{-1})^{-1}(\mean X-\theta)^2. \end{align*} \section {Exercises on Chapter \arabic{section}} \startqs \nextq Using Bayes postulate $p(\pi)=1$ for $0\leqslant\pi\leqslant1$ we get a posterior $(n+1)\pi^n$ which has mean $(n+1)/(n+2)$. \nextq From Table B.5, $\underline F_{\,40,24}=0.55$ and $\overline F_{40,24}=1.87$, so for $\Be(20,12)$ take lower limit $20\times0.55/(12+20\times0.55)=0.48$ and upper limit $20\times1.87/(12+20\times1.87)=0.76$ so 90\% HDR $(0.48.0.76)$. Similarly by interpolation $\underline F_{\,41,25}=0.56$ and $\overline F_{41,25}=1.85$, so for $\Be(20.5,12.5)$ quote $(0.48.0.75)$. Finally by interpolation $\underline F_{\,42,26}=0.56$ and $\overline F_{42,26}=1.83$, so for $\Be(21,13)$ quote $(0.47.0.75)$. It does not seem to make much difference whether we use a $\Be(0,0)$, a $\Be(\half,\half)$ or a $\Be(1,1)$ prior. \nextq Take $\alpha/(\alpha+\beta)=1/3$ so $\beta=2\alpha$ and \[ \Var\pi=\frac{\alpha\beta}{(\alpha+\beta)^2(\alpha+\beta+1)} =\frac{2}{3^2(3\alpha+1)} \] so $\alpha=55/27\cong2$ and $\beta=4$. Posterior is then $\Be(2+8,4+12)$, that is $\Be(10,16)$. The 95\% values for $F_{32,20}$ are 0.45 and 2.30 by interpolation, so those for $F_{20,32}$ are 0.43 and 2.22. An appropriate interval for $\pi$ is from $10\times0.43/(16+10\times0.43$ to $10\times2.22/(16+10\times2.22)$, that is $(0.21,0.58)$. \nextq Take $\alpha/(\alpha+\beta)=0.4$ and $\alpha+\beta=12$, so approximately $\alpha=5$, $\beta=7$. Posterior is then $\Be(5+12,7+13)$, that is, $\Be(17,20)$. \nextq Take beta prior with $\alpha/(\alpha+\beta)=\third$ and $\alpha+\beta=\quarter11=2.75$. It is convenient to take integer $\alpha$ and $\beta$, so take $\alpha=1$ and $\beta=2$, giving $\alpha/(\alpha+\beta)=\third$ and $(\alpha+\beta)/11=0.273$. Variance of prior is $2/36=0.0555$ so standard deviation is 0.236. Data is such that $n=11$ and $X=3$, so posterior is $\Be(4,10)$. From tables, values of $\F$ corresponding to a 50\% HDR for $\log\F_{20,8}$ are $\underline F=0.67$ and $\overline F=1.52$. 
It follows that the appropriate interval of values of $\F_{8,20}$ is
$(1/\overline F, 1/\underline F)$, that is $(0.66, 1.49)$. Consequently an
appropriate interval for the proportion $\pi$ required is
$4\times0.66/(10+4\times0.66)\leqslant\pi\leqslant4\times1.49/(10+4\times1.49)$,
that is $(0.21, 0.37)$. The posterior mean is $4/(4+10)=0.29$, the posterior
mode is $(4-1)/(4+10-2)=0.25$ and using the relationship
$\text{median}\cong(2\,\text{mean}+\text{mode})/3$ the posterior median is
approximately 0.28. The actual overall proportion $86/433=0.20$ is consistent
with the above. [Data from the York A.U.T.\ Membership List for 1990/91.]
\nextq By Appendix A.13, $\E x=n(1-\pi)/\pi$ and $\Var x=n(1-\pi)/\pi^2$, so
$g'(\E x)=\half n^{-1}[(1-\pi)/\pi^2]^{-\frac{1}{2}}$, so
$\Var g(x)\cong1/4n$ (cf.\ Section 1.5). By analogy with the argument that an
arc-sine prior for the binomial distribution is approximately data-translated,
this suggests a prior uniform in $\sinh^{-1}\sqrt{\pi}$ so with density
$\half\pi^{-\frac{1}{2}}(1+\pi)^{-\frac{1}{2}}$. But note Jeffreys' Rule
suggests $\Be(0,\half)$ as remarked in Section 7.4.
\nextq As this is a rare event, we use the Poisson distribution. We have
$n=280$, $T=\sum x_i=196$. Posterior is $S_1^{-1}\chi_{\nu^{\,\prime}}^2$
where $S_1=S_0+2n$, $\nu^{\,\prime}=\nu+2T$. For the reference prior take
$\nu=1$, $S_0=0$, so we get a posterior which is $560^{-1}\chi_{393}^2$.
Using the approximation in Appendix A.3, we find that a 95\% interval is
$\half(\sqrt{785} \pm 1.96)^2/560$, that is, $(0.61,0.80)$.
\nextq Prior is such that $\nu/S_0=0.66$, $2\nu/S_0^2=0.115^2$, so $\nu=66$,
$S_0=100$, so posterior has $S_1=660$, $\nu^{\,\prime}=458$. This gives a
posterior 95\% interval $\half(\sqrt{915}\pm1.96)^2/660$, that is,
$(0.61,0.79)$.
\nextq $\partial \log p(x|\alpha)/\partial\alpha=3/2\alpha-\half x^2$ and
$\partial^2 \log p(x|\alpha)/\partial\alpha^2=-3/2\alpha^2$, so
$I(\alpha|x)=3/2\alpha^2$ and we take $p(\alpha)\propto1/\alpha$ or
equivalently a uniform prior in $\psi=\log\alpha$.
\nextq We have $\partial^2L/\partial\pi^2=-x/\pi^2-z/(1-\pi-\rho)^2$, etc., so
\[ I(\pi,\rho\,|\,x,y,z)=\left(\begin{array}{cc}
   n/\pi+n/(1-\pi-\rho) & n/(1-\pi-\rho) \\
   n/(1-\pi-\rho) & n/\rho+n/(1-\pi-\rho)
   \end{array}\right) \]
and so after a little manipulation
$\det I(\pi,\rho\,|\,x,y,z)=n^2\{\pi\rho(1-\pi-\rho)\}^{-1}$, suggesting a
prior
\[ p(\pi,\rho)\propto\pi^{-\frac{1}{2}}\rho^{-\frac{1}{2}}
   (1-\pi-\rho)^{-\frac{1}{2}} \]
which is evidently related to the arc-sine distribution.
\nextq $\partial \log p(x|\gamma)/\partial\gamma=1/\gamma+\log\xi-\log x$ and
$\partial^2 \log p(x|\gamma)/\partial\gamma^2=-1/\gamma^2$, so
$I(\gamma|x)=1/\gamma^2$ and we take $p(\gamma)\propto1/\gamma$ or
equivalently a uniform prior in $\psi=\log\gamma$.
\nextq Using Appendix A.16, coefficient of variation is
$\sqrt{}\{2/(\gamma+1)(\gamma-2)\}$. This is less than 0.01 if
$\half(\gamma+1)(\gamma-2)>1/0.01^2$ or $\gamma^2-\gamma-20,002 > 0$, so if
$\gamma > \half(1+\sqrt{80,009})=141.9$. Taking the reference prior
$p(\alpha,\beta)\propto(\beta-\alpha)^{-2}$, that is,
$\Pabb(-\infty,\infty,-1)$ (cf.\ Section 3.6), we need $\gamma'=n-1>141.9$,
that is, $n$ at least 143.
\nextq Take prior $p(\theta)=(d-1)\theta^{-d}$ for $\theta_0 < \theta < \infty$.
Then posterior is $p(\theta|\vect x)=(d'-1)\theta^{-d'}$ for
$\theta_1 < \theta < \infty$ where $d'=d+n(c+1)$ and
$\theta_1=\max\{\theta_0,M\}$ where $M=\max\{x_i\}$.
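\blankline
The bound on $\gamma$ found in Exercise 12 above is easily confirmed
numerically in \R; the helper function \prog{cv} below is simply the
coefficient of variation quoted there:
\begin{verbatim}
# Coefficient of variation as a function of gamma (Exercise 12)
cv <- function(gamma) sqrt(2 / ((gamma + 1) * (gamma - 2)))
uniroot(function(gamma) cv(gamma) - 0.01, c(3, 1e6))$root  # about 141.9
cv(142)    # gamma' = n - 1 = 142 when n = 143: just below 0.01
\end{verbatim}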
\nextq Prior $p(\nu)\propto1/\nu$ as before, but likelihood is now $p(71,100\,|\,\nu)=1/\nu^2$ for $\nu\geqslant100$, so posterior is approximately \[ p(\nu\,|\,71,100)\propto \nu^{-3}/\left(\sum_{\mu\geqslant100}\mu^{-3}\right). \] Approximating sums by integrals, the posterior probability $\Pr(\nu\geqslant \nu_0\,|\,71,100)=100^2/\nu_0^2$, and in particular the posterior median is $100\sqrt{2}$ or about 140. \nextq We have $p(\theta)=1/\theta$ and setting $\psi=\log\theta$ we get $p(\psi)=p(\theta)\,|\!\dtheta/\!\dpsi|=1$. Thus we expect all pages to be more or less equally dirty. \nextq The group operation is $(x,y)\mapsto x+y\mod2\pi$ and the appropriate Haar measure is Lebesgue measure on the circle, that is, arc length around the circle. \nextq Table of $c^2\{c^2+(x_1-\mu)^2\}^{-1}\{c^2+(x_2-\mu)^2\}^{-1}$: \[ \begin{array}{cccccc} \mu\backslash c \ & 1 & 2 & 3 & 4 & 4 \\ 0 & 0.00 & 0.01 & 0.01 & 0.01 & 0.01 \\ 2 & 0.06 & 0.05 & 0.04 & 0.03 & 0.02 \\ 4 & 0.04 & 0.06 & 0.05 & 0.04 & 0.03 \\ 6 & 0.06 & 0.05 & 0.04 & 0.03 & 0.02 \\ 8 & 0.00 & 0.01 & 0.01 & 0.01 & 0.01 \end{array} \] Integrating over $c$ using Simpson's Rule (and ignoring the constant) for the above values of $\mu$ we get 0.11, 0.48, 0.57, 0.48 and 0.11 respectively. Integrating again we get for intervals as shown: \[ \begin{array}{cccccc} (-1,1) & (1,3) & (3,5) & (5,7) & (7,9) & \text{Total} \\ 0.92 & 2.60 & 3.24 & 2.60 & 0.92 & 10.28 \end{array} \] so the required posterior probability is $3.24/10.28=0.31$. \nextq First part follows as sum of concave functions is concave and a concave function has a unique maximum. For the example, note that \begin{align*} p(x|\theta) & =\exp(\theta-x)/\{1+\exp(\theta-x)\}^2 \ \ (-\infty < x < \infty) \\ & =\quarter\sech^2\half(\theta-x)\ \ (-\infty < x < \infty) \end{align*} (which is symmetrical about $x=\theta$), so that the log-likelihood is \[ L(\theta|x)=\theta-x-2\log\{1+\exp(\theta-x)\}. \] Hence \begin{align*} L'(\theta|x) & = 1 - 2\exp(\theta-x)/\{1+\exp(\theta-x)\} \\ & = \{1-\exp(\theta-x)\}/\{1+\exp(\theta-x)\} \\ & = 1 - 2/\{1+\exp(x-\theta)\} \\ L''(\theta|x) & =-2\exp(x-\theta)/\{1+\exp(x-\theta)\}^2 \\ & =-2\exp(\theta-x)/\{1+\exp(\theta-x)\}^2. \end{align*} As this is clearly always negative, the log-likelihood is concave. Also \begin{align*} L'(\theta|x)/L''(\theta|x) & =\half\{\exp(\theta-x)-\exp(x-\theta)\} \\ I(\theta|x) & = 2\int_{-\infty}^{\infty} \exp(2(\theta-x))/\{1+\exp(\theta-x)\}^4\dx \\ & = (1/8) \int_{-\infty}^{\infty} \sech^4(\theta-x) \dx \\ & = (1/8) \int_{-\infty}^{\infty} \sech^4 y \dy \\ & = (1/24) [\sinh y \sech^3 y + 2 \tanh y]_{-\infty}^{\infty}=1/6. \end{align*} (The integration can be checked by differentiation of the result). Now proceed as in Section 3.10. \section {Exercises on Chapter \arabic{section}} \startqs \nextq $1-(1-p_0)=p_0=\left[1+(1-\pi_0)\pi_0^{-1}B^{-1}\right]^{-1} =1-(1-\pi_0)\pi_0^{-1}B^{-1}+(1-\pi_0)^2\pi_0^{-2}B^{-2}$. Result then follows on noting $\pi_0^{-1}=\left\{1-(1-\pi_0)\right\}^{-1} \cong1+(1-\pi_0)$. \nextq Substituting in the formulae in the text we get \begin{align*} \phi_1 & = (0.9^{-2}+1.8^{-2})^{-1}=0.648=0.80^2; \\ \theta_1 & = 0.648(93.3/0.9^2+93.0/1.8^2)=93.2; \\ \pi_o & = \Phi((93.0-93.3)/0.9)=\Phi(-0.333)=0.3696;\ \ \pi_0/(1-\pi_0)=0.59; \\ p_0 & = \Phi((93.0-93.2)/0.8)=\Phi(-0.25)=0.4013;\ \ p_0/(1-p_0)=0.67; \\ B & = 0.67/0.59=1.14. \end{align*} \nextq $n=12$, $\mean x=118.58$, $S=12969$, $s/\sqrt{n}=9.91$, $(\mean x-100)/(s/\sqrt{n})=1.875$. 
Taking a normal approximation this gives a $P$-value of $1-\Phi(1.875)=0.0303$. \nextq $n=300$, $n\pi=900/16=56.25$, $n\pi(1-\pi)=45.70$, $(56-56.25)/\sqrt{45.70}=-0.037$, so certainly not significant. \nextq Posterior with reference prior is $S_1^{-1}\chi_{\nu^{\,\prime}}^2$ with $S_1=12$ and $\nu^{\,\prime}=31$. Values of $\chi^2$ corresponding to a 90\% HDR for $\log\chi_{31}^2$ are (by interpolation) 19.741 and 45.898, so a 90\% posterior HDR is from 1.65 to 3.82 which includes 3. So not appropriate to reject null hypothesis. \nextq If $k=0.048$ then $\exp(2k)=1.101=1/0.908$, so take $\theta$ within $\pm\varepsilon=k\sqrt{(\phi/n)}\z=0.048\sqrt{(\phi/10)}/2.5=0.006\sqrt{\phi}$. \nextq Using standard power series expansions \begin{align*} B & = (1+1/\lambda)^{\frac{1}{2}}\exp[-\half(1+\lambda)^{-1}] \\ & = \lambda^{-\frac{1}{2}}(1+\lambda)^{\frac{1}{2}}\exp(-\half z^2) \exp[\half\lambda z^2(1+\lambda)^{-1}] \\ & = \lambda^{\frac{1}{2}}(1+\half\lambda+\dots)\exp(-\half z^2) [1+\half z^2(1+\lambda)^{-1}+\dots] \\ & = \lambda^{\frac{1}{2}}\exp(-\half z^2)(1+\half\lambda(z^2+1)+\dots). \end{align*} \nextq Likelihood ratio is \begin{align*} \frac {\{2\pi(\phi+\varepsilon)\}^{-n/2}\exp[-\half\sum x_i^2/(\phi+\varepsilon)]} {\{2\pi(\phi-\varepsilon)\}^{-n/2}\exp[-\half\sum x_i^2/(\phi-\varepsilon)]} & \cong\left(1+\frac{\varepsilon}{\phi}\right)^{-n} \exp\left[\varepsilon\sum x_i^2/\phi^2\right] \\ & \cong\exp\left[\varepsilon\left(\sum x_i^2-n\phi\right)/\phi^2\right] \\ & \cong\exp\left[\frac{n\varepsilon}{\phi} \left(\frac{\sum x_i^2/n}{\phi}-1\right)\right]. \end{align*} \nextq $\partial p_1(\mean x)/\partial\psi=\{-\half(\psi+\phi/n)^{-1}+ \half(\mean x-\theta)^2/(\psi+\phi/n)^2\}p_1(\mean x)$ which vanishes if $\psi+\phi/n=(\mean x-\theta)^2=z^2\phi/n$ so if $\psi=(z^2-1)\phi/n$. Hence $p_1(\mean x)\leqslant(2\pi\phi/n)^{-\frac{1}{2}}z^{-1}\exp(-\half)$. Consequently $B=(2\pi\phi/n)^{-\frac{1}{2}}\exp(-\half z^2)/p_1(\mean x) \geqslant\sqrt{\text{e}}\,z\exp(-\half z^2)$. For last part use $p_0=\left[1+(\pi_1/\pi_0)B^{-1}\right]$. \nextq Posterior probability is a minimum when $B$ is a minimum, hence when $2\log B$ is a minimum, and \[ \d\,(2\log B)/\d n= \{1+n\}^{-1}+z^2\{1+1/n\}^{-2}(-1/n^2) \] which vanishes when $n=z^2-1$. Since $z^2-1$ is not necessarily an integer, this answer is only approximate. \nextq Test against $\B(7324,0.75)$ with mean 5493 and variance 1373, so $z=(5474-5493)/\sqrt{1373}=-0.51$. Not significant so theory confirmed. \nextq Likelihood ratio is \begin{align*} & \frac{(2\pi2\phi)^{-\frac{1}{2}}\exp[-\half u^2/(2\phi)] (2\pi\psi)^{-\frac{1}{2}}\exp[-\half(z-\mu)/\psi]} {(2\pi2\psi)^{-\frac{1}{2}}\exp[-\half u^2/(2\psi)] (2\pi\psi/2)^{-\frac{1}{2}}\exp[-\half(z-\mu)/(\psi/2)]} \\ & = (\psi/2\phi)^{\frac{1}{2}} \exp[-\half u^2(1//2\phi-1/2\psi)+\half(z-\mu)^2/\psi] \\ & \cong (\psi/2\phi)^{\frac{1}{2}} \exp[-\half u^2/(2\phi)+\half(z-\mu)^2/\psi] \end{align*} With $\sqrt{(\psi/\phi)}=100$, $u=2\times\sqrt{(2\phi)}$ and $z=\mu$, we get $B=\left(100/\sqrt{2}\right)\exp(-2)=9.57$, although 2 standard deviation is beyond 1.96, the 5\% level. \nextq $p_1(\mean x)=\int \rho_1(\theta) p(\mean x|\theta)\dtheta$ which is $1/\tau$ times the integral between $\mu+\tau/2$ and $\mu-\tau/2$ of an $\N(\theta,\phi/n)$ density and so as $\tau\gg\phi/n$ is nearly the whole integral. Hence $p_1(\mean x)\cong1/\tau$, from which it easily follows that $B=(2\pi\phi/n\tau^2)^{-\frac{1}{2}}\exp(-\half z^2)$. 
In Section 4.5 we found $B=(1+n\psi/\phi)^{\frac{1}{2}}\exp[-\half z^2(1+\phi/n\psi)^{-1}]$, which for large $n$ is about $B=(\phi/n\psi)^{-\frac{1}{2}}\exp(-\half z^2)$. This agrees with the first form found here if $\tau^2=2\pi\phi$. As the variance of a uniform distribution on $(\mu-\tau/2,\mu+\tau/2)$ is $\tau^2/12$, this may be better expressed as $\tau^2/12=(\pi/6)\phi=0.52\phi$. \nextq Jeffreys considers the case where both the mean $\theta$ and the variance $\phi$ are unknown, and wishes to test $\Hyp_0:\theta=0$ versus $\Hyp_1: \theta\neq0$ using the conventional choice of prior odds $\pi_0/\pi_1=1$, so that $B=p_0/p_1$. Under both hypotheses he uses the standard conventional prior for $\phi$, namely $p(\phi)\propto1/\phi$. He assumes that the prior for $\theta$ is dependent on $\sigma=\sqrt{\phi}$ as a scale factor. Thus if $\gamma=\theta/\sigma$ he assumes that $\pi(\gamma,\theta)=p(\gamma)p(\theta)$ so that $\gamma$ and $\theta$ are independent. He then assumes that \blankline(i) if the sample size $n=1$ then $B=1$, and \blankline(ii) if the sample size $n\geqslant2$ and $S=\sum(X_i-\mean X)^2=0$, then $B=0$, that is, $p_1=1$. \blankline From (i) he deduces that $p(\gamma)$ must be an even function with integral 1, while he shows that (ii) is equivalent to the condition \[ \int_0^{\infty} p(\gamma)\gamma^{n-2}\dgamma=\infty. \] He then shows that the simplest function satisfying these conditions is $p(\gamma)=\pi^{-1}(1+\gamma^2)^{-1}$ from which it follows that $p(\theta)=p(\gamma)|\!\dgamma/\!\dtheta|= \pi^{-1}\sigma(\sigma^2+\theta^2)^{-1}$. Putting $\sigma=\sqrt{\phi}$ and generalizing to the case where $\Hyp_0$ is that $\theta=\theta_0$ we get the distribution in the question. There is some arbitrariness in Jeffreys' ``simplest function''; if instead he had taken $p(\gamma)=\pi^{-1}\kappa(\kappa^2+\gamma^2)^{-1}$ he would have ended up with $p(\theta)=\pi^{-1}\tau(\tau^2+(\theta-\theta_0)^2)^{-1}$ where $\tau=\kappa\sigma$. However, even after this generalization, the argument is not overwhelmingly convincing. \nextq \!\!(a)\quad Maximum likelihood estimator $\est\theta$ of $\theta$ is $x/n$, so \[ B\geqslant\left(\frac{\theta_0}{\est\theta}\right)^x \left(\frac{1-\theta_0}{1-\est\theta}\right)^{n-x};\qquad\ p_0\geqslant\left[1+\frac{1-\pi_0}{\pi_0} \left(\frac{\est\theta}{\theta_0}\right)^x \left(\frac{1-\est\theta}{1-\theta_0}\right)^{n-x}\right]^{-1}. \] \blankline (b)\quad From tables (e.g.\ D.~V.~Lindley and W.~F.~Scott, \textit{New Cambridge Elementary Statistical Tables}, Cambridge: University Press 1995 [1st edn (1984)], Table 1, or H.~R.~Neave, \textit{Statistics Tables for mathematicians, engineers, economists and the behavioural and management sciences}, London: George Allen \& Unwin (1978), Table 1.1) the probability of a binomial observation $\leqslant14$ from $\B(20,0.5)$ is 0.9793, so the appropriate (two-tailed) $P$-value is $2(1-0.9793)=0.0414\cong1/24$. The maximum likelihood estimate is $\est\theta=15/20=0.75$, so the lower bound on $B$ is $(0.5/0.75)^{15}(0.5/0.25)^5=2^{20}/3^{15}=0.0731$, implying a lower bound on $p_0$ of 0.0681 or just over 1/15. \nextq \!\!(a)\quad $n=12$, $\nu=11$, $t=\mean x/(s/\sqrt{n})=1.2/\sqrt{(1.1/12)}=3.96$ and if we take $k=1$ the Bayes factor $B$ is \[ \frac{(1+t^2/\nu)^{-(\nu+1)/2}} {(1+nk)^{-\frac{1}{2}}(1+t^2(1+nk)^{-1}/\nu)^{-(\nu+1)/2}} =\frac{0.004910}{(0.2774)(0.5356)} =0.033. 
\] \blankline (b)\quad $z=\mean x/(s/\sqrt{n})=1.2/\sqrt{12}=4.16$ and (taking $\psi=\phi$ as usual) \begin{align*} B & = (1+n)^{\frac{1}{2}}\exp\left[-\half z^2(1+1/n)^{-1}\right] \\ & = (1+12)^{\frac{1}{2}}\exp\left[-\half(4.16)^2(1+1/12)^{-1}\right]=0.001. \end{align*} \nextq Two-tailed $P$-value is 0.0432 (cf.\ D.~V.~Lindley and W.~F.~Scott, \textit{New Cambridge Elementary Statistical Tables}, Cambridge: University Press 1995 [1st edn (1984)], Table 9), while Bayes factor is \[ \frac{(1+t^2/\nu)^{-(\nu+1)/2}} {(1+nk)^{-\frac{1}{2}}(1+t^2(1+nk)^{-1}/\nu)^{-(\nu+1)/2}} =\frac{0.08712}{(0.3162)(0.7313)} =0.377 \] so $F=1/B=2.65$. Range $(1/30P,3/10P)$ is $(0.77, 6.94)$, so $F$ \textit{is} inside it and we do not need to ``think again''. \nextq For $P$-value 0.1 think again if $F$ not in $(\third,3)$. As $p_0=[1+F]^{-1}$ this means if $p_0$ not in $(0.250,0.750)$, so if $n=1000$. Similarly if $P$-value 0.05, if $p_0$ not in $(0.143,0.600)$, so if $n=1000$ (and the case $n=100$ is marginal); if $P$-value 0.01, if $p_0$ not in $(0.032,0.231)$, so if $n=100$ or $n=1000$; if $P$-value 0.001, if $p_0$ not in $(0.003,0.029)$, so if $n=50$, $n=100$ or $n=1000$. \section {Exercises on Chapter \arabic{section}} \startqs \nextq Mean difference $\mean w=0.05\dot{3}$, $S=0.0498$, $s=0.0789$. Assuming a standard reference prior for $\theta$ and a variance known equal to $0.0789^2$, the posterior distribution of the effect $\theta$ of Analyst $A$ over Analyst $B$ is $\N(0.05\dot{3},0.0789^2/9)$ leading to a 90\% interval $0.05\dot{3}\pm1.6449\times0.0789/3$, that is, $(0.010,0.097)$. If the variance is not assumed known then the normal distribution should be replaced by $\t_8$. \nextq If variance assumed known, $z=0.05\dot{3}/(0.0789/3)=2.028$ so with $\psi=\phi$ \begin{align*} B & =(1+n)^{\frac{1}{2}}\exp[-\half z^2(1+1/n)^{-1}] \\ & =(1+9)^{\frac{1}{2}}\exp[-\half(2.028)^2(1+\ninth)^{-1}] =0.497 \end{align*} and $p_0=\left[1+0.497^{-1}\right]^{-1}=0.33$ with the usual assumptions. If the variance is not assumed known, \[ B=\frac{\{1+2.028^2/8\}^{-9/2}} {10^{-\frac{1}{2}}\{1+2.028^2{10}^{-1}/8\}^{-9/2}} =\frac{0.1546}{(0.3162)(0.7980)}=0.613 \] and $p_0=\left[1+0.613^{-1}\right]^{-1}=0.38$. \nextq We know $\sum(x_i+y_i)\sim\N(2\theta,2\sum\phi_i)$ and $\sum(x_i-y_i)$ is independently $\N(0,2\sum\phi_i)$, so $\sum(x_i-y_i)^2/2\sum\phi_i\sim\chi_n^2$ and hence if $\theta=0$ then \[ \sum(x_i+y_i)/\sqrt{\sum(x_i-y_i)^2}\sim\t_n. \] Hence test whether $\theta=0$. If $\theta\ne0$, it can be estimated as $\half\sum(x_i+y_i)$. \nextq In that case \begin{align*} B & = (1+440.5/99)^{\frac{1}{2}}\exp[-\half(1.91)^2\{1+99/440.5\}^{-1}] \\ & = 5.4495^{\frac{1}{2}}\exp(-1.4893)=0.53. \end{align*} If the prior probability of the null hypothesis is taken as $\pi_0=\half$, then this gives a posterior probability of $p_0=(1+0.53^{-1})^{-1}=0.35$. \nextq $m=9$, $n=12$, $\mean x=12.42$, $\mean y=12.27$, $s_x=0.1054$, $s_y=0.0989$. \blankline (a)\quad Posterior of $\delta$ is $\N(12.42-12.27,0.1^2(1/9+1/12))$ so 90\% interval is $0.15\pm1.6449\times\sqrt{0.00194}$, that is, $(0.077,0.223)$. \blankline (b)\quad $S=8\times0.1054^2+11\times0.0989^2=0.1965$, $s=\sqrt{0.1965/19}=0.102$, so $s\sqrt{\phantom{|}}(m^{-1}+n^{-1})=0.045$, so from tables of $\t_{19}$ a 90\% interval is $0.15\pm1.729\times0.045$, that is $(0.072,0.228)$. 
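\blankline
The arithmetic in part (b) is easily reproduced in \R\ (using the exact
$\t_{19}$ percentage point in place of tables), for instance:
\begin{verbatim}
# 90% HDR for delta in part (b), pooled variance case
m <- 9; n <- 12; xbar <- 12.42; ybar <- 12.27
sx <- 0.1054; sy <- 0.0989
S <- (m - 1) * sx^2 + (n - 1) * sy^2          # about 0.1965
s <- sqrt(S / (m + n - 2))                    # about 0.102
half <- qt(0.95, df = m + n - 2) * s * sqrt(1/m + 1/n)
(xbar - ybar) + c(-1, 1) * half               # about (0.072, 0.228)
\end{verbatim}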
\nextq With independent priors uniform in $\lambda$, $\mu$ and $\log\phi$, that is, $p(\lambda,\mu,\phi)\propto1/\phi$, \begin{align*} p(\lambda,\mu,\phi\, & |\,\vect x,\vect y) \propto p(\lambda,\mu,\phi)\, p(\vect x|\lambda,\phi)\,p(\vect y|\mu,\phi) \\ & \propto (1/\phi)(2\pi\phi)^{-(m+n)/2} \exp\left[-\half \left\{\sum(x_i-\lambda)^2+\half\sum(y_i-\mu)^2\right\}/\phi\right] \\ & \propto \phi^{-(m+n)/2-1} \exp[-\half\{S_x+m(\mean x-\lambda)^2+ \half S_y+\half n(\mean y-\mu)^2\}/\phi] \end{align*} Writing $S=S_x+\half S_y$ and $\nu=m+n-2$ we get \begin{align*} p(\lambda,\mu,\phi|\vect x,\vect y) & \propto \phi^{-\nu/2-1} \exp[-\half S/\phi]\,\, (2\pi\phi/m)^{-\frac{1}{2}}\exp[-\half m(\lambda-\mean x)^2)/2\phi] \\ & \qquad\times(2\pi2\phi/m)^{-\frac{1}{2}} \exp[-\half m(\mu-\mean y)^2)/2\phi] \\ & \propto p(\phi|S)\, p(\lambda|\phi,\mean x)\, p(\mu|\phi,\mean y) \end{align*} where \[ \begin{array}{ll} p(\phi\,|\,S) & \text{is an $S\chi_{\nu}^{-2}$ density,} \\ p(\lambda\,|\,\phi,\mean x) & \text{is an $\N(\mean x,\phi/m)$ density,} \\ p(\mu\,|\,\phi,\mean x) & \text{is an $\N(\mean y,2\phi/m)$ density,} \end{array} \] It follows that, for given $\phi$, the parameters $\lambda$ and $\mu$ have independent normal distributions, and so that the joint density of $\delta=\lambda-\mu$ and $\phi$ is \[ p(\delta,\phi|\vect x,\vect y) = p(\phi|S)\, p(\delta|\mean x-\mean y, \phi) \] where $p(\delta\,|\,\mean x-\mean y, \phi)$ is an $\N(\mean x-\mean y, \ \phi(m^{-1}+2n^{-1})$ density. This variance can now be integrated out just as in Sections 2.12 and 5.2, giving a very similar conclusion, that is, that if \[ t=\frac{\delta-(\mean x-\mean y)}{s\{m^{-1}+2n^{-1}\}^{\frac{1}{2}}} \] where $s^2=S/\nu$, then $t\sim\t_{\nu}$. \nextq We find that \[ S_1=S_0+S_x+S_y+m_0\lambda_0^2+m\mean x^2+ n_0\mu_0^2+n\mean y^2-m_1\lambda_1^2-n_1\mu_1^2 \] and then proceed as in Exercise 18 on Chapter 2 to show that \[ S_1=S_0+S_x+S_y+(m_0^{-1}+m^{-1})^{-1}(\mean x-\lambda_0)^2+ (n_0^{-1}+n^{-1})^{-1}(\mean y-\mu_0)^2. \] \nextq $m=10$, $n=9$, $\mean x=22.2$, $\mean y=23.1$, $s_x=1.253$, $s_y=0.650$. Consequently $\sqrt{\phantom{|}}(s_x^2/m+s_y^2/n)=0.452$ and $\tan\theta==(s_y/\sqrt{n})/(s_x/\sqrt{m})=0.547$, so $\theta\cong30^{\circ}$. Interpolating in Table B.1 with $\nu_1=8$ and $\nu_2=9$ the 90\% point of the Behrens' distribution is 1.42, so a 90\% HDR is $22.2-23.1\pm1.42\times0.452$, that is, $(-1.54,-0.26)$. \nextq Evidently $f_1=(m-1)/(m-3)$ and \[ f_2=\frac{(m-1)^2}{(m-3)^2(m-5)}(\sin^4\theta+\cos^4\theta). \] Also $1=(\sin^2\theta+\cos^2\theta)^2=\sin^4\theta+\cos^4\theta+ 2\sin^2\theta\cos^2\theta$, so that $\sin^4\theta+\cos^4\theta= 1-\sin^2 2\theta=\cos^2 2\theta$. The result follows. \nextq As $T_x\sim\t_{\nu(x)}$ and $T_y\sim\t_{\nu(y)}$ we have \[ p(T_x,T_y|\vect x,\vect y)\propto (1+T_x^2/\nu(x))^{-(\nu(x)+1)/2} (1+T_y^2/\nu(y))^{-(\nu(y)+1)/2} \] Jacobian is trivial, and the result follows (cf.\ Appendix A.18). \nextq Set $y=\nu_1x/(\nu_2+\nu_1x)$ so that $1-y=1/(\nu_2+\nu_1x)$ and $\!\dy/\!\dx=-\nu_1/(\nu_2+\nu_1x)^2$. Then using Appendix A.19 for the density of $x$ \[ p(y)=p(x)\left|\frac{\!\dx}{\!\dy}\right| \propto \frac{x^{\nu_1/2-1}}{(\nu_2+\nu_1x)^{(\nu_1+\nu_2)/2-2}} \propto y^{\nu_1/2-1}(1-y)^{\nu_2/2-1} \] from which the result follows. \nextq $k=s_1^2/s_2^2=6.77$, so $\eta^{-1}=k/\kappa=6.77/\kappa\sim \F_{24,14}$. A 95\% interval for $\F_{24,14}$ from Table B.5 is $(0.40,2.73)$ so one for $\kappa$ is $(2.5,16.9)$. \nextq $k=3$ with $\nu_x=\nu_y=9$. 
A an interval corresponding to a 99\% HDR for $\log\F$ for $\F_{9,9}$ is $(0.15,6.54)$, so a 99\% interval for $\sqrt{\kappa}$ is from $\sqrt{3\times0.15}$ to $\sqrt{3\times6.54}$, that is, $(0.67,4.43)$. \nextq Take $\alpha_0+\beta_0=6$, $\alpha_0/(\alpha_0+\beta_0)=\half$, $\gamma_0+\delta_0=6$, $\gamma_0/(\gamma_0+\delta_0)=\twothirds$, so $\alpha_0=3$, $\beta_0=3$, $\gamma_0=4$, $\delta_0=2$ and hence $\alpha=3+a=11$, $\beta=3+b=15$, $\gamma=4+c=52$, $\delta=2+d=64$ so that \[ \log\{(\alpha-\half)(\delta-\half)/(\beta-\half)(\gamma-\half)\} =\log0.983=-0.113 \] while $\alpha^{-1}+\beta^{-1}+\gamma^{-1}+\delta^{-1}=0.192$. Hence posterior of log-odds ratio is $\Lambda-\Lambda'\sim\N(-0.113,0.192)$. The posterior probability that $\pi > \rho$ is \[ \Phi(-0.113/\sqrt{0.192})=\Phi(-0.257)=0.3986. \] \nextq Cross-ratio is $(45\times29)/(28\times30)=1.554$ so its logarithm is 0.441. More accurately, adjusted value is $(44.5\times28.5)/(27.5\times29.5)=1.563$ so its logarithm is 0.446. Value of $a^{-1}+b^{-1}+c^{-1}+d^{-1}$ is 0.126, and so the posterior distribution of the log-odds ratio is $\Lambda-\Lambda'\sim\N(0.441,0.126)$ or more accurately $\N(0.446, 0.126)$. The posterior probability that the log-odds ratio is positive is $\Phi(0.441/\sqrt{0.126})=\Phi(1.242)=0.8929$ or more accurately $\Phi(0.446/\sqrt{0.126})=\Phi(1.256)=0.8955$. With the same data $\sin^{-1}\sqrt{(45/73)}=0.903$ radians and $\sin^{-1}\sqrt{(30/59)}=0.794$ radians, and $1/4m+1/4n=0.00766$, so the posterior probability that $\pi > \rho$ is $\Phi((0.903-0.794)/\sqrt{0.00766})=\Phi(1.245)=0.8934$. \nextq $\Lambda-\Lambda'=\log(\pi/\rho)-\log\{(1-\pi)/(1-\rho)\}$ so if $\pi-\rho=\alpha$ we get \begin{align*} \Lambda-\Lambda' &=\log\{\pi/(\pi-\alpha)\}-\log\{(1-\pi)/(1-\pi+\alpha)\} \\ &=-\log\{1-\alpha)/\pi\}+\log\{1+\alpha/(1-\pi)\} \\ &\cong\alpha/\pi+\alpha/(1-\pi)=\alpha/\{\pi(1-\pi)\}. \end{align*} \nextq With conventional priors, posteriors are near $\pi\sim\Be(56,252)$ and $\rho\sim\Be(34,212)$, so approximating by normals of the same means and variances $\pi\sim\N(0.1818,0.0004814)$, $\rho\sim\N(0.1382,0.0004822)$, so $\pi-\rho\sim\N(0.0436,0.000963)$ so $\Pr(\pi-\rho > 0.01)=\Phi((0.0436-0.01)/\sqrt{0.000963})= \Phi(1.083)=0.8606$ and so the posterior odds are $0.8606/(1-0.8606)=6.174$. \nextq Using a normal approximation, $x\sim\N(8.5,8.5)$ and $y\sim\N(11.0,11.0)$, so that $x-y\sim\N(-2.5,19.5)$. \section {Exercises on Chapter \arabic{section}} \startqs \nextq Straightforward substitution gives {\catcode`?=\active \def?{\kern\digitwidth} \[ \begin{array}{lllllll} \text{Sample} & ?1 & ?2 & ?3 & ?4 & ?5 &\ \text{Total} \\ n & 12 & 45 & 23 & 19 & 30 &\ 129 \\ r & ?0.631 & ?0.712 & ?0.445 & ?0.696 & ?0.535 \\ \tanh^{-1}z & ?0.743 & ?0.891 & ?0.478 & ?0.860 & ?0.597 \\ n\tanh^{-1}z & ?8.916 & 40.095 & 10.994 & 16.340 & 17.910 &\ ?94.255 \end{array} \] } Posterior for $\zeta$ is $\N(94.255/129,1/129)$ and 95\% HDR is $0.7307\pm1.96\times0.0880$, that is, $(0.5582,0.9032)$. A corresponding interval for $\rho$ is $(0.507,0.718)$. \nextq Another straightforward substitution gives {\catcode`?=\active \def?{\kern\digitwidth} \[ \begin{array}{lllllll} n & 45 & 34 & 49 \\ 1/n & ?0.0222 & ?0.0294 & ?0.0204 \\ r & ?0.489 & ?0.545 & ?0.601 \\ \tanh^{-1}z & ?0.535 & ?0.611 & ?0.695 \end{array} \] } Hence $\zeta_1-\zeta_2\sim\N(0.535-0.611,\ 0.0222+0.0294)$, that is, $\N(-0.076,0.227^2)$. Similarely $\zeta_2-\zeta_3\sim\N(-0.084,0.223^2)$ and $\zeta_3-\zeta_1\sim\N(0.160,0.206^2)$. 
It follows without detailed examination that there is no evidence of difference. \nextq $\zeta\sim\N\left(\sum n_i\tanh^{-1}r_i)/\sum n_i\,,\,1/\sum n_i\right)$ so required interval is \[ \frac{\sum n_i\tanh^{-1}r_i}{\sum n_i}\pm\frac{1.96}{\sqrt{\sum n_i}}. \] \nextq We found in Section 6.1 that \[ p(\rho\,|\,\vect x,\vect y)\propto p(\rho)\frac{(1-\rho^2)^{(n-1)/2}} {(1-\rho r)^{n-(3/2)}} \] As $p(\rho)(1-\rho^2)^{-\frac{1}{2}}(1-\rho r)^{3/2}$ does not depend on $n$, for large $n$ \[ p(\rho\,|\,\vect x,\vect y)\propto\frac{(1-\rho^2)^{n/2}} {(1-\rho r)^n} \] so that $\log l(\rho\,|\,\vect x,\vect y)=c+\half n\log(1-\rho^2)-n\log(1-\rho r)$, and hence \[ (\partial/\partial\rho)\log l(\rho\,|\,\vect x,\vect y)= -n\rho(1-\rho^2)^{-1}+nr(1-\rho r)^{-1}. \] implying that the maximum likelihood estimator $\est\rho\cong r$. Further \[ (\partial^2/\partial\rho^2)\log l(\rho\,|\,\vect x,\vect y)= -n(1-\rho^2)^{-1}-2n\rho^2(1-\rho^2)^{-2}+nr^2(1-\rho r)^{-2}, \] so that if $\rho=r$ we have $(\partial^2/\partial\rho^2)\log l(\rho\,|\,\vect x,\vect y)= -n(1-\rho^2)^{-2}$. This implies that the information should be about $I(\rho\,|\,\vect x,\vect y)=n(1-\rho^2)^{-2}$, and so leads to a prior $p(\rho)\propto(1-\rho^2)^{-1}$. \nextq We have \[ p(\rho|\vect x,\vect y)\propto p(\rho)(1-\rho^2)^{(n-1)/2} \int_0^{\infty} (\cosh t-\rho r)^{-(n-1)}\dt \] Now write $-\rho r=\cos\theta$ so that \[ p(\rho|\vect x,\vect y)\propto p(\rho)(1-\rho^2)^{(n-1)/2} \int_0^{\infty} (\cosh t+\cos\theta)^{-(n-1)}\dt \] and set \[ I_k=\int_0^{\infty} (\cosh t + \cos\theta)^{-k} \dt \] We know that $I_1=\theta/\sin\theta$ (cf.\ J.~A.~Edwards, \textit{A Treatise on the Integral Calcuous} (2 vols), London: Macmillan (1921) [reprinted New York: Chelsea (1955)], art.\ 180). Moreover \[ (\partial/\partial\theta)(\cosh t+\cos\theta)^{-k}= k\sin\theta(\cosh t+\cos\theta)^{-(k+1)} \] and by induction \[ (\partial/\sin\theta\partial\theta)^k(\cosh t+\cos\theta)^{-1}= k!(\cosh t+\cos\theta)^{-(k+1)} \] Differentiating under the integral sign, we conclude that \[ (\partial/\sin\theta\partial\theta)^kI_1=k!I_{k+1}\qquad\ (k\geqslant0). \] Taking $k=n-2$, or $k+1=n-1$, we get \[ I_k=\int_0^{\infty} (\cosh t + \cos\theta)^{-k} \dt\propto (\partial/\sin\theta\partial\theta)^k(\theta/\sin\theta). \] (ignoring the factorial). Consequently \[ p(\rho|\vect x,\vect y)\propto p(\rho)(1-\rho^2)^{(n-1)/2} (\partial/\sin\theta\partial\theta)^k(\theta/\sin\theta). 
\] Since $\d(r\rho)/\!\dtheta=\d(-\cos\theta)/\!\dtheta=\sin\theta$ and so $\partial/\sin\theta\partial\theta=\d/\d(r\rho)$, we could alternatively write \[ p(\rho|\vect x,\vect y)\propto p(\rho)(1-\rho^2)^{(n-1)/2} \frac{\d^{n-2}}{\d(\rho r)^{n-2}} \left(\frac{\text{arc\ }\cos(-\rho r)}{(1-\rho^2r^2)^{\frac{1}{2}}}\right) \] \nextq Supposing that the prior is \[ p(\rho) \propto (1-\rho^2)^{k/2} \] and $r=0$ then \[ p(\rho|\vect x,\vect y) \propto p(\rho)(1-\rho^2)^{(n-1)/2} \] so with the prior as in the question we get \[ p(\rho|\vect x,\vect y) \propto (1-\rho^2)^{(k+n-1)/2} \] If we define \[ t=\sqrt{(k+n+1)}\frac{\rho}{\sqrt{(1-\rho^2)}},\qquad \rho=\frac{t}{\sqrt{\{(k+n+1)+t^2\}}} \] so that \begin{gather*} 1-\rho^2=\frac{(k+n+1)}{(k+n+1)+t^2},\qquad 2\rho\frac{\drho}{\dt}=\frac{(k+n+1)2t}{\{(k+n+1)+t^2\}^2} \\ \frac{\drho}{\dt}=\frac{(k+n+1)}{\{(k+n+1)+t^2\}^{3/2}} \end{gather*} and hence \begin{align*} p(t|\vect X,\vect Y) & \propto p(\rho|\vect X,\vect Y)\drho/\dt \\ & \propto\left[\frac{(k+n+1)}{(k+n-1)+t^2}\right]^{(k+n-1)/2} \frac{(k+n+1)}{\{(k+n+1)+t^2\}^{3/2}} \\ & \propto\left[1+\frac{t^2}{k+n+1}\right]^{-(k+n+2)/2} \end{align*} This can be recognized as Student's $\t$ distribution on $k+n+1$ degrees of freedom (see Section 2.12). \nextq See G.~E.~P.~Box and G.~C.~Tiao, \textit{Bayesian Inference in Statistical Analysis}, Reading, MA: Addison-Wesley (1973, Section 8.5.4---the equation given in the question is implicit in their equation (8.5.49)). \nextq See G.~E.~P.~Box and G.~C.~Tiao, \textit{Bayesian Inference in Statistical Analysis}, Reading, MA: Addison-Wesley (1973, Section 8.5.4, and in particular equation (8.5.43)). \nextq For a bivariate normal distribution \begin{align*} \log l(\alpha,\beta,\gamma|x,y) & =-\log2\pi+\half\log\delta \\ & \quad-\half\alpha(x-\lambda)^2-\gamma(x-\lambda)(y-\mu)-\half\beta(y-\mu)^2 \end{align*} where $\delta=\alpha\beta-\gamma^2=1/\Delta$. 
Then \begin{align*} (\partial/\partial\alpha)\log l & =\half\beta/\delta-\half(x-\lambda)^2 \\ (\partial/\partial\beta)\log l & ==\half\alpha/\delta-\half(y-\mu)^2, \\ (\partial/\partial\gamma)\log l & =-\gamma/\delta-(x-\lambda)(y-\mu), \end{align*} Consequently the information matrix, i.e.\ minus the matrix of second derivatives (taking expectations is trivial as all elements are constant) is \begin{align*} I & =\left(\begin{array}{ccc} \smallhalf\beta^2\delta^{-2} & -\smallhalf\delta^{-1}+\smallhalf\alpha\beta\delta^{-2} & -\beta\gamma\delta^{-2} \\ -\smallhalf\delta^{-1}+\smallhalf\alpha\beta\delta^{-2} & \smallhalf\alpha^2\delta^{-2} & -\alpha\gamma\delta^{-2} \\ -\beta\gamma\delta^{-2} & -\alpha\gamma\delta^{-2} & \delta^{-1}+2\gamma^2\delta^{-2} \end{array}\right) \\ & =\frac{1}{2\delta^2}\left(\begin{array}{ccc} \beta^2 & \gamma^2 & -2\beta\gamma \\ \gamma^2 & \alpha^2 & -2\alpha\gamma \\ -2\beta\gamma & -2\alpha\gamma & 2(\alpha\beta+\gamma^2) \end{array}\right) \end{align*} so that its determinant is \[ \det I=\frac{1}{8\delta^6}\left|\begin{array}{ccc} \beta^2 & \gamma^2 & -2\beta\gamma \\ \gamma^2 & \alpha^2 & -2\alpha\gamma \\ -2\beta\gamma & -2\alpha\gamma & -2(\alpha\beta+\gamma^2) \end{array}\right| \] Adding $\half\beta/\gamma$ times the last column to the first and $\half\gamma/\beta$ times the last column to the second we get \begin{align*} \det I & =\frac{1}{8\delta^6}\left(\begin{array}{ccc} 0 & 0 & -2\beta\gamma \\ -\delta & \alpha\beta^{-1}\delta & -2\alpha\gamma \\ \beta\gamma^{-1}\delta & -\beta^{-1}\gamma\delta & 2(\alpha\beta+\gamma^2) \end{array}\right) \\ & =\frac{1}{8\delta^6}(-2\beta\gamma) \left(-\frac{\alpha}{\gamma}\delta^2+\frac{\gamma}{\beta}\delta^2\right) =\frac{1}{4\delta^3} \end{align*} We thus conclude that $\det I=1/4\delta^3$ implying a reference prior \[ p(\alpha,\beta,\gamma\propto\delta^{-3/2}. \] Rather similarly we get \begin{align*} \frac{\partial(\alpha,\beta,\gamma)}{\partial(\phi,\psi,\kappa)} & =\left|\begin{array}{ccc} -\psi^2/\Delta^2 & 1/\Delta-\phi\psi/\Delta^2 & 2\psi\kappa/\Delta^2 \\ 1/\Delta-\phi\psi/\Delta^2 & -\phi^2/\Delta^2 & 2\phi\kappa/\Delta^2 \\ \psi\kappa/\Delta^2 & \phi\kappa/\Delta^2 & -1/\Delta-2\kappa^2/\Delta^2 \end{array}\right| \\ & =\frac{1}{\Delta^6}\left|\begin{array}{ccc} \psi^2 & \kappa^2 & -2\psi\kappa \\ \kappa^2 & \phi^2 & -2\phi\kappa \\ -\psi\kappa & -\phi\kappa & \phi\psi+\kappa^2 \end{array}\right| \end{align*} and evaluate this as $-1/\Delta^3$. It then follows that \[ p(\phi,\psi,\kappa)\propto |-1/\Delta^3|\,p(\alpha,\beta,\gamma) \propto\Delta^{-3/2}. \] \nextq Age is $y$, weight is $x$. $n=12$, $\mean x=2911$, $\mean y=38.75$, $S_{xx}=865,127$, $S_{yy}=36.25$, $S_{xy}=4727$. Hence $a=\mean y=38.75$, $b=S_{xy}/S_{xx}=0.005464$, $r=S_{xy}/\sqrt{(S_{xx}S_{yy})}=0.8441$, $S_{ee}=S_{yy}-S_{xy}^2/S_{xx}=S_{yy}(1-r^2)=10.422$, and $s^2=S_{ee}/(n-2)=1.0422$. Take $x_0=3000$ and get $a+b(x_0-\mean x)=39.24$ as mean age for weight 3000. For a particular baby, note that the 95\% point of $\t_{10}$ is 1.812 and $s\sqrt{\phantom{|}}\{1+n^{-1}+(x_0-\mean x)^2/S_{xx}\}=1.067$ so a 90\% interval is $39.24\pm1.812\times1.067$, that is, $(37.31,41.17)$. For the mean weight of all such babies note $s\sqrt{\phantom{|}}\{n^{-1}+(x_0-\mean x)^2/S_{xx}\}=0.310$, so interval is $(38.68, 39.80)$. \nextq From the formulae in the text \[ S_{ee}=S_{yy}-S_{xy}^2/S_{xx}=S_{yy}-2S_{xy}^2/S_{xx}+S_{xy}^2/S_{xx} = S_{yy}-2bS_{xy}+b^2S_{xx} \] where $b=S_{xy}/S_{xx}$. 
Hence \begin{align*} S_{ee} & =\sum(y_i-\mean y)^2-2b\sum(y_i-\mean y)(x_i-\mean x) +b^2\sum(x_i-\mean x)^2 \\ & =\sum\{(y_i-\mean y)-b(x_i-\mean x)\}^2. \end{align*} The result now follows as $a=\mean y$. \nextq Clearly (as stated in the hint) $\sum u_i=\sum v_i=\sum u_iv_i=0$, hence $\mean u=\mean v=0$ and $S_{uv}=0$. We now proceed on the lines of Section 6.3, redefining $S_{ee}$ as $S_{yy}-S_{uy}^2/S_{uu}-S_{vy}^2/S_{vv}$ and noting that \begin{align*} & \sum (y_i-\alpha-\beta u_i-\gamma v_i)^2= \sum\left\{(y_i-\mean y)+(\mean y-\alpha)- \beta u_i-\gamma v_i\right\}^2 \\ & =S_{yy}+n(\mean y-\alpha)^2+\beta^2S_{uu}+\gamma^2S_{vv} -2\beta S_{uy}-2\gamma S_{vy} \\ & =S_{yy}-S_{uy}^2/S_{uu}-S_{vy}^2/S_{vv}+n(\mean y-\alpha)^2 \\ & \qquad +S_{uu}(\beta-S_{uy}/S_{uu})^2+S_{vv}(\gamma-S_{vy}/S_{vv})^2 \\ & =S_{ee}+n(\mean y-\alpha)^2+ S_{uu}(\beta-S_{uy}/S_{uu})^2+S_{vv}(\gamma-S_{vy}/S_{vv})^2 \end{align*} We consequently get a density $p(\alpha,\beta,\gamma,\phi\,|\,\vect x,\vect y)$ from which we integrate out \textit{both} $\beta$ and $\gamma$ to get \[ p(\alpha,\phi\,|\,\vect x,\vect y)\propto\phi^{-n/2} \exp[-\half\{S_{ee}+n(\alpha-a)^2\}/\phi]. \] The result now follows. \nextq $I=4$, $\sum K_i=28$, $G=1779$, $\sum\sum x_{ik}^2=114,569$, $C=G^2/N=113,030$. Further, $S_T=1539$, $S_t=(426^2+461^2+450^2+442^2)/7-C=93$ and hence the analysis of variance table is as follows: {\catcode`?=\active \def?{\kern\digitwidth} \begin{center} \begin{tabular}{lllll} \multicolumn{5}{c}{ANOVA Table} \\ \hline & & Degrees \\ & Sum of & of & Mean \\ Source & squares & freedom & square & Ratio \\ \hline Treatments & ??93 & ?3 & ?31 & $( < 1)$ \\ Error & 1446 & 24 & ?60 \\ \hline TOTAL & 1539 & 27 \end{tabular} \end{center} } \noindent We conclude that the results from the four samples agree. \nextq $d=\theta_2+\theta_4+\theta_6-\theta_3-\theta_5-\theta_7$ with $\est d=-1.4$ and $K_d=(6/4)^{-1}$ so that $0.82(d+1.4)/2.74\sim\t_{25}$. \nextq The analysis of variance table is as follows: {\catcode`?=\active \def?{\kern\digitwidth} \begin{center} \begin{tabular}{lllll} \multicolumn{5}{c}{ANOVA Table} \\ \hline & & Degrees \\ & Sum of & of & Mean \\ Source & squares & freedom & square & Ratio \\ \hline Treatments & ?49,884 & ?2 & 24,942 & 13.3 \\ Blocks & 149,700 & ?5 & 29,940 & 16.0 \\ Error & ?18,725 & 10 & ?1,872 \\ \hline TOTAL & 218,309 & 17 \end{tabular} \end{center} } \noindent We conclude that the treatments differ significantly (an $\F_{2,10}$ variable exceeds 9.43 with probability 0.5\%). \nextq This is probably seen most clearly by example. When $r=t=2$ we take \[ \vect x=\left(\begin{array}{c}x_{111}\\x_{112}\\x_{121}\\x_{122}\\ x_{211}\\x_{212}\\x_{221}\\x_{222} \end{array}\right),\quad \matr A=\left(\begin{array}{cccccc} 1 & 1 & 0 & 1 & 0 & 0 \\ 1 & 1 & 0 & 1 & 0 & 0 \\ 1 & 1 & 0 & 0 & 1 & 1 \\ 1 & 1 & 0 & 0 & 1 & 1 \\ 1 & 0 & 1 & 1 & 0 & 1 \\ 1 & 0 & 1 & 1 & 0 & 1 \\ 1 & 0 & 1 & 0 & 1 & 0 \\ 1 & 0 & 1 & 0 & 1 & 0 \\ \end{array} \right),\quad \vect\beta=\left(\begin{array}{c}\mu\\ \alpha_1 \\ \alpha_2 \\ \beta_1 \\ \beta_2 \\ \kappa_{12} \end{array}\right).\] \nextq Evidently $\matr A^{+}\matr A=\matr I$ from which (a), (b) and (d) are immediate. For (c), note $\matr A\matr A^{+}=\matr A(\matr A\transpose\matr A)^{-1}\matr A\transpose$ which is clearly symmetric.
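\blankline As a quick numerical check on the last answer (an editorial addition, not part of the original solution), the following \R\ fragment generates a random matrix $\matr A$, which has full column rank with probability one, forms $\matr A^{+}=(\matr A\transpose\matr A)^{-1}\matr A\transpose$, and verifies that $\matr A^{+}\matr A=\matr I$, that $\matr A\matr A^{+}$ is symmetric and that $\matr A\matr A^{+}\matr A=\matr A$; the $4\times3$ dimensions and the seed are arbitrary choices.
\begin{tabbing}
\hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill
\> \prog{set.seed(1)} \\
\> \prog{A <- matrix(rnorm(12),nrow=4,ncol=3)} \\
\> \prog{Aplus <- solve(t(A)\%*\%A)\%*\%t(A)} \\
\> \prog{print(all.equal(Aplus\%*\%A,diag(3)))} \\
\> \prog{print(all.equal(A\%*\%Aplus,t(A\%*\%Aplus)))} \\
\> \prog{print(all.equal(A\%*\%Aplus\%*\%A,A))}
\end{tabbing}
\noindent Each of the three checks should print \prog{TRUE} (up to rounding error).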
\nextq We take \[ \matr A=\left(\begin{array}{cc}1 & x_1\\1 & x_2\\ \multicolumn{2}{c}{\vdots}\\ 1 & x_n\end{array}\right),\qquad \matr A\transpose\matr A=\left(\begin{array}{cc} n & \sum x_i\\ \sum x_i & \sum x_i^2\end{array}\right) \] so that writing $S=\sum(x_i-\mean x)^2$ we see that $\det(\matr A\transpose\matr A)=nS$ and hence \[ (\matr A\transpose\matr A)^{-1}=\frac{1}{nS}\left(\begin{array}{cc} \sum x_i^2 & -\sum x_i\\ -\sum x_i & n\end{array}\right),\qquad \matr A\transpose\vect y=\left(\begin{array}{c} \sum y_i\\ \sum x_iy_i\end{array}\right) \] from which $\est\beeta=(\matr A\transpose\matr A)^{-1}\matr A\transpose\vect y$ is easily found. In particular \[ \est\eta_1=\frac{1}{nS}\left(-\sum x_i\sum y_i+n\sum x_iy_i\right)= \frac{\sum(y_i-\mean y)(x_i-\mean x)}{\sum(x_i-\mean x)^2}. \] \section {Exercises on Chapter \arabic{section}} \startqs \nextq Recall that $t$ is sufficient for $\theta$ given $x$ if $p(x|\theta)=f(t,\theta)g(x)$ (see Section 2.9). Now \[ p(t|\theta)=\left\{\begin{array}{ll} p(x|\theta)+p(y|\theta)=p(z|\theta) & \text{if $t=t(z)$} \\ p(x|\theta) & \text{if $t=t(x)$ for $x\neq y$} \end{array}\right. \] so that \[ p(t|\theta)=\left\{\begin{array}{ll} 0 & \text{if $x=y$} \\ p(x|\theta) & \text{if $x=t$} \end{array}\right. \] Setting $f(t,\theta)=p(t|\theta)$ and $g(x)=1$ ($x\neq y$), $g(y)=0$, it follows that $t$ is sufficient for $\theta$ given $x$. It then follows from a na\"\i ve application of the weak sufficiency principle that $\Ev(E,y,\theta)=\Ev(E,z,\theta)$. However if $\random x$ is a continuous random variable, then $p_{\random x}(y|\theta)=0$ for all $y$, so we may take $y$ and $z$ as \textit{any} two possible values of $\random x$. \nextq $l(\theta\,|\,g(x))=l(\theta\,|\,h(x))$ from which the result follows. \nextq The likelihood function is easily seen to be \[ l(\theta|x)=\frac{1}{3} \text{\qquad for\qquad} \theta= \left\{\begin{array}{ll} x/2,\ 2x,\ 2x+1 & \text{when $x$ is even} \\ (x-1)/2,\ 2x,\ 2x+1 & \text{when $x\neq1$ is odd} \\ 1,\ 2,\ 3 & \text{when $x=1$.} \end{array}\right. \] The estimators $d_1$, $d_2$ and $d_3$ corresponding to the smallest, middle and largest possible $\theta$ are \[ d_1(x)= \left\{\begin{array}{ll} x/2 & \text{when $x$ is even} \\ (x-1)/2 & \text{when $x\neq1$ is odd} \\ 1 & \text{when $x=1$,} \end{array}\right. \] and $d_2(x)=2x$, $d_3(x)=2x+1$. The values of $p(d_1=\theta)$ given in the question now follow. However, a Bayesian analysis leads to a posterior \[ p(\theta|x)=\frac{l(\theta|x)\pi(\theta)}{p(x)}= \frac{\pi(\theta)I_A(\theta)}{\pi(d_1(x))+\pi(d_2(x))+\pi(d_3(x))} \] where $\pi(\theta)$ is the prior, $A=\{d_1(x), d_2(x), d_3(x)\}$ and $I_A(\theta)$ is the indicator function of $A$. Thus, indeed, the data conveys nothing to the Bayesian version except that $\theta$ is $d_1(x)$, $d_2(x)$, or $d_3(x)$. However, Bayesians are indifferent between $d_1(x)$, $d_2(x)$ and $d_3(x)$ only if they have equal prior probabilities, which cannot hold for all $x$ if the prior is proper. For a discussion, see J.~O.~Berger and R.~L.~Wolpert, \textit{The Likelihood Principle}, Hayward, CA: Institute of Mathematical Statistics (1984 and 1988, Example 34). \nextq Computation of the posterior is straightforward. For a discussion, see J.~O.~Berger and R.~L.~Wolpert \textit{The Likelihood Principle}, Hayward, CA: Institute of Mathematical Statistics (1984 and 1988, Example 35). \nextq Rules (a) and (b) \textit{are} stopping times and rule (c) is \textit{not}. \nextq $n=4$, $\mean x=1.25$, $z=1.25\sqrt{4}=2.50$.
\blankline (a)\quad Reject at the 5\% level (and indeed possibly do so on the basis of the first observation alone). \blankline (b)\quad Since $B=(1+n\psi/\phi)^{\frac{1}{2}}\exp[-\half z^2(1+\phi/n\psi)^{-1}]=1.07$ with $\psi=\phi$, taking $\pi_0=\half$ we get $p_0=(1+B^{-1})^{-1}=0.52$, so the null hypothesis is still more probable than not. \nextq As we do \textit{not} stop the first four times but \textit{do} the fifth time \begin{align*} p(\vect x|\lambda) & =\frac{\lambda^{3+1+2+5+7}}{3!\,1!\,5!\,2!\,7!} \exp(-5\lambda)\frac{3}{3}\frac{1}{4}\frac{2}{6}\frac{5}{11}\frac{11}{18} \\ l(\lambda|\vect x) & \propto \lambda^{18}\exp(-5\lambda). \end{align*} \nextq $\E S=\E (S+1)-1$ and after re-arrangement $\E(S+1)$ is $(s+1)(R''-2)/(r''-2)$ times a sum of probabilities for the beta-Pascal distribution with $S$ replaced by $(S+1)$, with $s$ replaced by $(s+1)$, and with $r''$ replaced by $(r''-1)$. As probabilities sum to unity, the result follows. \nextq Up to a constant \[ L(\pi|x,y)=\log l(\pi|x,y)=(x+n)\log\pi+(n-x+y)\log(1-\pi) \] so $-\partial^2L(\pi|x,y)/\partial\pi^2=(x+n)/\pi^2+(n-x+y)/(1-\pi)^2$. Because the expectations of $x$ and $y$ are $n\pi$ and $n(1-\pi)/\pi$ respectively, we get $I(\pi|x,y)=n(1+\pi)/\{\pi^2(1-\pi)\}$, so that Jeffreys' rule leads to \[ p(\pi|x,y)\propto (1+\pi)^{\frac{1}{2}}\pi^{-1}(1-\pi)^{-\frac{1}{2}}. \] \nextq $\E u(x)=\sum u(x)p(x)=\sum u(x)2^{-x}$ suggesting you would enter provided $e < \E u(x)$. If $u(x)\propto 2^x$ (that is, if utility is proportional to the amount won) then $\E u(x)\propto\sum2^x2^{-x}=\infty$ resulting in the implausible proposition that you would pay an arbitrarily large entry fee $\pounds e$. \nextq By differentiation of the log-likelihood $L(\theta|x)=x\log\theta+(n-x)\log(1-\theta)$ with respect to $\theta$ we see that $x/n$ is the maximum likelihood estimator. Because the prior for $\theta$ is uniform, that is, $\Be(1,1)$, the posterior is $\Be(x+1,n-x+1)$. The question deals with a particular case of weighted quadratic loss, so we find $d(x)$ as \[ \E^w(\theta|x)= \frac{\E((1-\theta)^{-1}|x)}{\E (\theta^{-1}(1-\theta)^{-1}\,|\,x)}= \frac{\Betafn(x+1,\,n-x)}{\Betafn(x+1,\,n-x+1)} \frac{\Betafn(x+1,\,n-x+1)}{\Betafn(x,\,n-x)}=\frac{x}{n}. \] If $x=0$ or $x=n$ then the posterior loss $\rho(a,x)$ is infinite for all $a$ because the integral diverges at $\theta=0$ or at $\theta=1$, so the minimum is not well-defined. \nextq The minimum of $\E (\pi-\rho)^2-2a\E (\pi-\rho)+a^2$ clearly occurs when $a=\E (\pi-\rho)$. But since the prior for $\pi$ is uniform, that is, $\Be(1,1)$, its posterior is $\Be(x+1,n-x+1)$ and so its posterior mean is $(x+1)/(n+2)$; similarly the posterior mean of $\rho$ is $(y+1)/(n+2)$. We conclude that the Bayes rule is \[ d(x,y)=(x-y)/(n+2). \] \nextq Posterior mean (resulting from quadratic loss) is a weighted mean of the component means, so with the data in Section 2.10 is \[ \alpha'\frac{10}{10+20}+\beta'\frac{20}{10+20}= \frac{115}{129}\frac{10}{10+20}+\frac{14}{129}\frac{20}{10+20}=0.370. \] Posterior median (resulting from absolute error loss) can be found by computing the weighted mean of the distribution functions of the two beta distributions for various values and homing in. Result is 0.343. Posterior mode (resulting from zero-one loss) is not very meaningful in a bimodal case, and even in a case where the posterior is not actually bimodal it is not very useful. \nextq If we take as loss function \[ L(\theta, a) = \left\{\begin{array}{ll} v(\theta-a) & \text{if $a\leqslant\theta$} \\ u(a-\theta) & \text{if $a\geqslant\theta$} \end{array}\right.
\] Suppose $m(x)$ is a $v/(u+v)$ fractile of the posterior distribution of $\theta$, so that \[ \Pr\bigl(\theta\leqslant m(x)\bigr)\geqslant v/(u+v),\qquad \Pr\bigl(\theta\geqslant m(x)\bigr)\geqslant u/(u+v). \] Suppose further that $d(x)$ is any other rule and, for definiteness, that $d(x) > m(x)$ for some particular $x$ (the proof is similar if $d(x) < m(x)$). Then \[ L(\theta, m(x)) - L(\theta, d(x)) = \left\{\begin{array}{ll} u[m(x)-d(x)] & \text{if $\theta\leqslant m(x)$} \\ (u+v)\theta - [vm(x) + ud(x)] & \text{if $m(x) < \theta < d(x)$} \\ v[d(x)-m(x)] & \text{if $\theta\geqslant d(x)$} \end{array}\right. \] while for $m(x) < \theta < d(x)$ \[ (u+v)\theta-[vm(x)+ud(x)] < v[\theta-m(x)] < v[d(x)-m(x)] \] so that \[ L(\theta,m(x))-L(\theta,d(x))\leqslant\left\{\begin{array}{ll} u[m(x)-d(x)] & \text{if $\theta\leqslant m(x)$} \\ v[d(x)-m(x)] & \text{if $\theta > m(x)$} \end{array}\right. \] and hence on taking expectations over $\theta$ \begin{align*} \rho(m(x),x)-\rho(d(x),x) & \leqslant \bigl\{u[m(x)-d(x)]\bigr\}\Pr(\theta\leqslant m(x)\,|\,x) \\ & \qquad+ \bigl\{v[d(x)-m(x)]\bigr\} \Pr(\theta > m(x)\,|\,x) \\ & = \bigl\{d(x)-m(x)\bigr\} \bigl\{-u\Pr(\theta\leqslant m(x)\,|\,x) \\ & \qquad+ v\Pr(\theta > m(x)\,|\,x)\bigr\} \\ & \leqslant\bigl\{d(x)-m(x)\bigr\}\bigl\{-uv/(u+v)+uv/(u+v)\bigr\} = 0 \end{align*} from which it follows that taking a $v/(u+v)$ fractile of the posterior distribution does indeed result in the appropriate Bayes rule for this loss function. \nextq By integration by parts, if $\theta\sim\Be(2,k)$ then \begin{align*} \Pr(\theta<\alpha) & = \int_0^{\alpha} k(k+1)\theta(1-\theta)^{k-1}\dtheta \\ & = \left[-(k+1)(1-\theta)^k\theta\right]_0^{\alpha}+ \int_0^{\alpha}(k+1)(1-\theta)^k\dtheta \\ & = \left[-(k+1)(1-\theta)^k\theta-(1-\theta)^{k+1}\right]_0^{\alpha} = 1-(1-\alpha)^k(1+k\alpha). \end{align*} In this case, the prior for $\theta$ is $\Be(2,12)$ so that the prior probability of $\Hyp_0$, that is, that $\theta < 0.1$, is 0.379, while the posterior is $\Be(2,18)$ so that the posterior probability that $\theta < 0.1$ is 0.580. \blankline (a)\quad With ``0--1'' loss we accept the hypothesis with the greater posterior probability, in this case $\Hyp_0$. \blankline (b)\quad The second suggested loss function is of the ``0--$K_i$'' form and the decision depends on the relative sizes of $p_1$ and $2p_0$. Again this results in a decision in favour of $\Hyp_0$. \nextq Posterior expected losses are \[ \rho(a_0,x)=10p_1,\qquad\rho(a_1,x)=10p_0,\qquad\rho(a_2,x)=3p_0+3p_1. \] Choose $a_1$ if $0\leqslant p_0\leqslant0.3$, choose $a_2$ if $0.3\leqslant p_0\leqslant0.7$ and choose $a_0$ if $0.7\leqslant p_0\leqslant1$. \nextq Posterior variance is $(225^{-1}+100^{-1})^{-1}=69.23=8.32^2$ and the posterior mean is $69.23(100/225+115/100)=110.38$. Posterior expected losses are \[ \rho(a_1,x)=\int_{90}^{110} (\theta-90)\pi(\theta|x)\dtheta+ \int_{110}^{\infty} 2(\theta-90)\pi(\theta|x)\dtheta \] Now note that (with $\phi(x)$ the $\N(0,1)$ density function) \begin{align*} \int_{90}^{110} (\theta-90)\pi(\theta|x)\dtheta & = \sqrt{69.23}\int_{-2.450}^{-0.046} z\phi(z)\dz+ 20.38\int_{-2.450}^{-0.046} \phi(z)\dz \\ & = -\sqrt{(69.23/2\pi)}\{\exp(-\half0.046^2)-\exp(-\half2.450^2)\} \\ & \qquad+20.38\{\Phi(-0.046)-\Phi(-2.450)\} \\ & = -3.15+9.66=6.51. \end{align*} By similar manipulations we find that $\rho(a_1,x)=34.3$, $\rho(a_2,x)=3.6$ and $\rho(a_3,x)=3.3$, and thus conclude that $a_3$ is the Bayes decision.
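\blankline The arithmetic in the last answer can be checked by numerical integration. The \R\ fragment below is an editorial addition rather than part of the original answer; it evaluates the two integrals in the displayed expression for $\rho(a_1,x)$, using the posterior mean 110.38 and standard deviation 8.32 found above (the loss functions for $a_2$ and $a_3$ are not reproduced here).
\begin{tabbing}
\hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill
\> \prog{m <- 110.38} \\
\> \prog{s <- 8.32} \\
\> \prog{f1 <- function(theta) (theta-90)*dnorm(theta,m,s)} \\
\> \prog{f2 <- function(theta) 2*(theta-90)*dnorm(theta,m,s)} \\
\> \prog{I1 <- integrate(f1,90,110)\$value} \\
\> \prog{I2 <- integrate(f2,110,Inf)\$value} \\
\> \prog{print(I1)} \\
\> \prog{print(I1+I2)}
\end{tabbing}
\noindent The first value printed should be close to 6.51 and the second close to the figure $\rho(a_1,x)=34.3$ quoted above.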
\nextq In the negative binomial case \begin{align*} p(\pi|x) & = p(\pi) p(x|\pi)/p_{\random x}(x) \\ & = p(\pi)\binom{n+x-1}{x}(1-\pi)^n\pi^x/p_{\random x}(x) \end{align*} It follows that the posterior mean is \begin{align*} \E(\pi|x) & = \int \pi\,p(\pi)\binom{n+x-1}{x}(1-\pi)^n\pi^x\,\d\pi/p_{\random x}(x) \\ & = \frac{x+1}{n+x}\int p(\pi)\binom{n+x}{x+1}(1-\pi)^n\pi^{x+1}\,\d\pi/p_{\random x}(x) \\ & =\frac{(x+1)}{(n+x)}\frac{p_{\random x}(x+1)}{p_{\random x}(x)} \end{align*} This leads in the same way as in Section 7.5 to the estimate \[ \delta_n=\frac{(\xi+1)f_n(\xi+1)}{(n+\xi)(f_n(\xi)+1)}. \] \section {Exercises on Chapter \arabic{section}} \startqs \nextq Write $\gamma=\alpha/(\alpha+\beta)$ and $\delta=(\alpha+\beta)^{-1/2}$. The Jacobian is \[ \left|\begin{array}{cc} \partial\gamma/\partial\alpha & \partial\gamma/\partial\beta \\ \partial\delta/\partial\alpha & \partial\delta/\partial\beta \end{array}\right| =\left|\begin{array}{cc} \beta/(\alpha+\beta)^2 & -\alpha/(\alpha+\beta)^2 \\ -\half(\alpha+\beta)^{-3/2} & -\half(\alpha+\beta)^{-3/2} \end{array}\right| =-\half(\alpha+\beta)^{-5/2} \] from which the result follows. \nextq $p(\phi,\btheta\,|\,\vect x)\propto p(\phi)\,p(\btheta|\phi)\,p(\vect x|\btheta)$. So \[ p(\phi,\btheta\,|\,\vect x)\propto \phi^n\prod\theta_i^{x_i}\exp(-(\phi+1)\theta_i) \] and by the usual change-of-variable rule (using $|\!\dz/\dphi|=1/(1+\phi)^2$) \[ p(z,\btheta\,|\,\vect x)\propto z^{-n-2}(1-z)^n\prod\theta_i^{x_i}\exp(-\theta_i/z). \] Integrating with respect to all the $\theta_i$ using $\int\theta^x\exp(-\theta/z)\propto z^{x+1}$ we get \[ p(z|\vect x)\propto z^{n\mean x-2}(1-z)^n \] or $z\sim\Be(n\mean x-1,n+1)$, from which it follows that $\E z=(n\mean x-1)/(n\mean x+n)$. Now note that \[ p(\theta_i\,|\,x_i,\phi)\propto p(x_i|\theta_i)\,p(\theta_i|\phi) \propto\theta_i^{x_i}\exp(-\theta_i)\,\phi\exp(-\phi\theta_i) \propto \theta_i^{x_i}\exp(-\theta_i/z) \] which is a $\G(x_i+1,z)$ or $\half z\chi_{2(x_i+1)}^2$ distribution, from which it follows that $\E(\theta_i\,|\,x_i,\phi)=(x_i+1)z$ and so \[ \E(\theta_i|\vect x)=(x_i+1)\E(z|\vect x)=(x_i+1)(n\mean x-1)/(n\mean x+n). \] \nextq \!\!(a) Use same estimator $\est\btheta=\bhthetab$. Expression for $\rho(\bhthetao,\vect X)$ has $S_1$ replaced by a weighted sum of squares about $\mu$. \blankline (b) Use estimator $\est\btheta$ with $\est\theta_i$ the posterior median of the $i$th component. \nextq We find the mean square error of the transformed data is 3.14 times smaller and of the equivalent probabilities is 3.09 (as opposed to corresponding figures of 3.50 and 3.49 for the Efron--Morris estimator). For a complete analysis, see the program \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/baseball.txt} \bigskip \noindent or \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/cprogs/baseball.cpp} \bigskip \nextq Evidently $\matr A\transpose\matr A=\matr I$ or equivalently $\vect\alpha_j\transpose\vect\alpha_k=1$ if $j=k$ and 0 if $j\ne k$. It follows that $W_j\sim\N(0,1)$ and that $\Cov(W_j,W_k)=0$ if $j\ne k$ which, since the $W_j$ have a multivariate normal distribution, implies that $W_j$ and $W_k$ are independent. Further, as $\matr A\transpose\matr A=\matr I$ we also have $\matr A\matr A\transpose=\matr I$ and so \[ \matr W\transpose\matr W= (\vect X-\bmu)\transpose\matr A\matr A\transpose(\vect X-\bmu)= (\vect X-\bmu)\transpose(\vect X-\bmu). \] For the last part, normalize the $\alpha_{ij}$ for fixed $j$ to produce a unit vector and the rest follows as above.
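\blankline A brief numerical illustration of the last point (again an editorial addition): the \R\ fragment below constructs an orthogonal matrix $\matr A$ from a QR decomposition and checks that $\matr W\transpose\matr W=(\vect X-\bmu)\transpose(\vect X-\bmu)$. The dimension 4, the seed and the value of $\bmu$ are arbitrary choices.
\begin{tabbing}
\hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill
\> \prog{set.seed(1)} \\
\> \prog{A <- qr.Q(qr(matrix(rnorm(16),4,4)))} \\
\> \prog{mu <- c(1,2,3,4)} \\
\> \prog{X <- mu + rnorm(4)} \\
\> \prog{W <- t(A)\%*\%(X-mu)} \\
\> \prog{print(sum(W\^{}2))} \\
\> \prog{print(sum((X-mu)\^{}2))}
\end{tabbing}
\noindent The two sums of squares printed agree up to rounding error.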
\nextq We know that \[ R(\theta,\bhthetajs)-R(\theta,\bhthetajsplus)= \frac{1}{r}\sum\left[\E\left(\est\theta_i^{JS^2}-\est\theta_i^{JS^{{}_+2}}\right) -2\theta_i\E\left(\est\theta_i^{JS}-\est\theta_i^{JS^{{}_+}}\right)\right]. \] To show that the expression in brackets is always $ > 0$, calculate the expectation by first conditioning on $S_1$. For any value $S_1\geqslant r-2$, we have $\est\theta_i^{JS}=\est\theta_i^{JS^{{}_+}}$ so that it is enough to show that the right hand side is positive conditional on any value $S_1=s_1 < r-2$. Since in that case $\est\theta_i^{JS^{{}_+}}=0$, it is enough to show that for any $s_1 < r-2$ \[ \theta_i\E\left[\left.\est\theta_i^{JS}\,\right|\,S_1=s_1\right]= \theta_i\left(1-\frac{r-2}{s_1}\right)\E(X_i-\mu_i\,|\,S_1=s_1)\leqslant0 \] and hence it is enough to show that $\theta_i\E(X_i-\mu_i\,|\,S_1=s_1)\geqslant0$. Now $S_1=s_1$ is equivalent to $(X_1-\mu)^2=s_1-\left\{(X_2-\mu)^2+\dots+(X_n-\mu)^2\right\}$. Conditioning further on $X_2$, \dots, $X_n$ and noting that \begin{align*} \Pr(X_1=y\,|\,S_1=s_1,X_2=x_2,\dots,X_n=x_n) & \propto \exp\left(-\half(y-\theta_1)^2\right) \\ \Pr(X_1=-y\,|\,S_1=s_1,X_2=x_2,\dots,X_n=x_n) & \propto \exp\left(-\half(y+\theta_1)^2\right) \end{align*} we find that \[ \E\left[\theta_1(X_1-\mu_1)\,|\,S_1=s_1,X_2=x_2,\dots X_n=x_n\right]= \frac{\theta_1 y(\text{e}^{\theta_1y}-\text{e}^{-\theta_1y})} {\text{e}^{\theta_1y}+\text{e}^{-\theta_1y}} \] where $y=\sqrt{s_1-(x_2-\mu)^2-\dots-(x_n-\mu)^2}$. The right hand side is an increasing function of $|\theta_1y|$, which is zero when $\theta_1y=0$, and this completes the proof. \nextq Note that, as $\matr A\transpose\matr A+k\matr I$ and $\matr A\transpose\matr A$ commute, \begin{align*} \bhtheta-\bhtheta_k & =\left\{(\matr A\transpose\matr A+k\matr I)- (\matr A\transpose\matr A)\right\} (\matr A\transpose\matr A)^{-1} (\matr A\transpose\matr A+k\matr I)^{-1}\matr A\transpose\vect x \\ & =k(\matr A\transpose\matr A)^{-1}\bhtheta_k. \end{align*} The bias is \[ \vect b(k)=\left\{(\matr A\transpose\matr A+k\matr I)^{-1} \matr A\transpose\matr A-\matr I\right\}\btheta, \] and the sum of the squares of the biases is \[ \mathcal G(k)=\vect b(k)\transpose\vect b(k)= (\E\bhtheta_k-\btheta)\transpose(\E\bhtheta_k-\btheta). \] The variance-covariance matrix is \[ \Var(\bhtheta_k)=\phi(\matr A\transpose\matr A+k\matr I)^{-1} \matr A\transpose\matr A(\matr A\transpose\matr A+k\matr I)^{-1} \] so that the sum of the variances of the regression coefficients is \begin{align*} \mathcal F(k) & =\E(\bhtheta_k-\E\bhtheta_k)\transpose(\bhtheta_k-\E\bhtheta_k) \\ & =\text{Trace}(\Var(\bhtheta_k)) \\ & =\phi\,\text{Trace} \left\{(\matr A\transpose\matr A+k\matr I)^{-1} \matr A\transpose\matr A\, (\matr A\transpose\matr A+k\matr I)^{-1}\right\} \end{align*} and the residual sum of squares is \begin{align*} \textit{RSS}_k & = (\vect x-\matr A\bhtheta_k)\transpose(\vect x-\matr A\bhtheta_k) \\ & =\{(\vect x-\matr A\bhtheta)+\matr A(\bhtheta-\bhtheta_k)\}\transpose \{(\vect x-\matr A\bhtheta)+\matr A(\bhtheta-\bhtheta_k)\}.
\end{align*} Because $\matr A\transpose(\vect x-\matr A\bhtheta)=\matr A\transpose\vect x- \matr A\transpose\matr A(\matr A\transpose\matr A)^{-1} \matr A\transpose\vect x=\vect 0$, this can be written as \begin{align*} \textit{RSS}_k & = (\vect x-\matr A\bhtheta)\transpose(\vect x-\matr A\bhtheta) +(\bhtheta-\bhtheta_k)\transpose\matr A\transpose\matr A (\bhtheta-\bhtheta_k) \\ & =\textit{RSS}+ k^2\,\bhtheta_k\transpose(\matr A\transpose\matr A)^{-1}\bhtheta_k \end{align*} while the mean square error is \begin{align*} \textit{MSE}_k & =\E(\bhtheta_k-\btheta)\transpose(\bhtheta_k-\btheta) \\ & =\E\{(\bhtheta_k-\E\bhtheta_k)+(\E\bhtheta_k-\btheta)\}\transpose \{(\bhtheta_k-\E\bhtheta_k)+(\E\bhtheta_k-\btheta)\} \\ & =\E(\bhtheta_k-\E\bhtheta_k)\transpose(\bhtheta_k-\E\bhtheta_k)+ (\E\bhtheta_k-\btheta)\transpose(\E\bhtheta_k-\btheta) \\ & =\mathcal F(k) + \mathcal G(k) \end{align*} using the fact that $\E(\bhtheta_k-\E\bhtheta_k)=0$. It can be shown that $\mathcal F(k)$ is a continuous monotonic decreasing function of $k$ with $\mathcal F'(0) < 0$, while $\mathcal G(k)$ is a continuous monotonic increasing function with $\mathcal G(0)=\mathcal G'(0)=0$ which approaches $\btheta\transpose\btheta$ as an upper limit as $k\to\infty$. It follows that there always exists a $k > 0$ such that $\textit{MSE}_k < \textit{MSE}_0$. In fact, this is always true when $k < 2\phi/\btheta\transpose\btheta$ (cf.\ C.~M.~Theobald, ``Generalizations of mean square error applied to ridge regression'', \textit{Journal of the Royal Statistical Society Series} \textbf{B}, \textbf{36} (1974), 103--106). \nextq We defined \[ \matr H^{-1}=\matr\Psi^{-1}-\matr\Psi^{-1}\matr B \left(\matr B\transpose\matr\Psi^{-1}\matr B\right)^{-1} \matr B\transpose\matr\Psi^{-1} \] so that \[ \matr B\transpose\matr H^{-1}\matr B=\matr B\transpose\matr\Psi^{-1}\matr B -\matr B\transpose\matr\Psi^{-1}\matr B=\matr 0. \] If $\matr B$ is square and non-singular then \[ \matr H^{-1}=\matr\Psi^{-1}-\matr\Psi^{-1}\matr B \matr B^{-1}\matr\Psi\left(\matr B\transpose\right)^{-1} \matr B\transpose\matr\Psi^{-1} =\matr\Psi^{-1}-\matr\Psi^{-1}=\matr 0. \] \nextq It is easily seen that \begin{align*} \matr B\transpose\matr\Psi^{-1} & =\left(\begin{array}{cccc} \psi_{\alpha}^{-1} & \psi_{\alpha}^{-1} & 0 & 0 \\ 0 & 0 & \psi_{\beta}^{-1} & \psi_{\beta}^{-1} \end{array}\right), \\ \matr B\transpose\matr\Psi^{-1}\matr B & =\left(\begin{array}{cc} 2\psi_{\alpha}^{-1} & 0 \\ 0 & 2\psi_{\beta}^{-1} \end{array}\right) \end{align*} so that \begin{align*} \matr H^{-1} & =\matr\Psi^{-1}-\matr\Psi^{-1}\matr B \left(\matr B\transpose\matr\Psi^{-1}\matr B\right)^{-1} \matr B\transpose\matr\Psi^{-1} \\ & =\left(\begin{array}{cccc} \half\psi_{\alpha}^{-1} & -\half\psi_{\alpha}^{-1}& 0 & 0 \\ -\half\psi_{\alpha}^{-1} & \half\psi_{\alpha}^{-1}& 0 & 0 \\ 0 & 0 & \half\psi_{\beta}^{-1} & -\half\psi_{\beta}^{-1} \\ 0 & 0 & -\half\psi_{\beta}^{-1} & \half\psi_{\beta}^{-1} \end{array}\right) \end{align*} while \[ \matr A\transpose\matr A=\left(\begin{array}{cccc} 4 & 0 & 2 & 2 \\ 0 & 4 & 2 & 2 \\ 2 & 2 & 4 & 0 \\ 2 & 2 & 0 & 4 \end{array}\right).
\] Now \begin{align*} \matr K^{-1} & =\matr A\transpose\matr\Phi^{-1}\matr A+\matr H^{-1} \\ & =\left(\begin{array}{cccc} 4\phi^{-1}+\half\psi_{\alpha}^{-1} & -\half\psi_{\alpha}^{-1} & 2\phi^{-1} & 2\phi^{-1} \\ -\half\psi_{\alpha}^{-1} & 4\phi^{-1}+\half\psi_{\alpha}^{-1} & 2\phi^{-1} & 2\phi^{-1} \\ 2\phi^{-1} & 2\phi^{-1} & 4\phi^{-1}+\half\psi_{\beta}^{-1} & -\half\psi_{\beta}^{-1} \\ 2\phi^{-1} & 2\phi^{-1} & -\half\psi_{\beta}^{-1} & 4\phi^{-1}+\half\psi_{\beta}^{-1} \end{array}\right) \end{align*} \nextq See D.~V.~Lindley and A.~F.~M.~Smith, ``Bayes estimates for the linear model'' (with discussion), \textit{Journal of the Royal Statistical Society Series} \textbf{B}, \textbf{34} (1972), 1--41 [reprinted in N.~Polson and G.~C.~Tiao, \textit{Bayesian Inference} (2 vols), (\textit{The International Library of Critical Writings in Econometrics}, No.\ 7), Aldershot: Edward Elgar (1995)]. \nextq We have all $a_i=a$ so $\vect u\transpose\vect u=-mb/a$ so $1+\vect u\transpose\vect u=(a-mb)/a$ and thus \begin{align*} \Sigma_{ii} & =\frac{1}{a}\left(1-\frac{1}{1+\vect u\transpose\vect u} \frac{b}{a}\right)=\frac{1}{a}\left(1-\frac{b}{a-mb}\right) =\frac{a-(m+1)b}{a(a-mb)} \\ \Sigma_{ij} & =-\frac{(m+1)b}{a(a-mb)} \end{align*} where $a=n/\phi+1/\psi$ and $b=-1/r\psi$. \section {Exercises on Chapter \arabic{section}} \startqs \nextq A simulation in \textsf{R}, the program for which can be seen on \bigskip {\small \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/tprogs/integral.txt} } \bigskip \noindent resulted in values 1.684199, 1.516539, 1.7974, 1.921932, 1.595924, 1.573164, 1.812976, 1.880691, 1.641073, 1.770603 with a mean of 1.719450 as opposed to the theoretical value of $\text{e}-1=1.71828$. The theoretical variance of a single value of $\text{e}^X$ where $X\sim\U(0,1)$ is $\half(\text{e}-1)(3-\text{e})=0.24204$ so that the standard deviation of the mean of 10 values is $\sqrt{0.24204/10}=0.15557$ which compares with a sample standard deviation of 0.1372971. A simulation in C++, the program for which can be seen on \bigskip \noindent {\small \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/cprogs/integral.cpp} } \bigskip \noindent resulted in values 1.74529, 1.86877, 1.68003, 1.95945, 1.62928, 1.62953, 1.84021, 1.87704, 1.49146, 1.55213 with a mean of 1.72732 and a sample standard deviation of 0.155317. \nextq The easiest way to check the value of $\matr A^t$ is by induction. Then note that for large $t$ the matrix $\matr A^t$ is approximately \[ \left(\begin{array}{cc}\slopefrac{2}{5} & \slopefrac{3}{5} \\ \slopefrac{2}{5} & \slopefrac{3}{5}\end{array}\right) \] from which the result follows. \nextq In this case \[ Q=\E[(y_1+y_2-1)\log\eta+(y_3+y_4-1)\log(1-\eta)] \] giving \[ \eta^{(t+1)}=\frac{\E(y_1\,|\,\eta^{(t)},\vect x)+x_2-1} {\E(y_1\,|\,\eta^{(t)},\vect x)+x_2+x_3+\E(y_4\,|\,\eta^{(t)},\vect x)-2} \] where $y_1\sim\B(x_1,\eta/(\eta+1))$ and $y_4\sim\B(x_4,(1-\eta)/(2-\eta))$, so that \begin{align*} \eta^{(t+1)} & =\frac{x_1\eta^{(t)}/(\eta^{(t)}+1)+x_2-1} {x_1\eta^{(t)}/(\eta^{(t)}+1)+x_2+x_3+x_4(1-\eta^{(t)})/(2-\eta^{(t)})-2} \\ & =\frac{461\eta^{(t)}/(\eta^{(t)}+1)+130-1} {461\eta^{(t)}/(\eta^{(t)}+1)+130+161+515(1-\eta^{(t)})/(2-\eta^{(t)})-2}. \end{align*} Starting with $\eta^{(0)}=0.5$, successive iterations give 0.4681, 0.4461, 0.4411, 0.4394, 0.4386, 0.4385, and thereafter 0.4384. This compares with a value of 0.439 found by maximum likelihood in C.~A.~B.~Smith, \textit{op.
cit.} \nextq The proof that $p(\eta^{(t+1)}\,|\,\vect x)\geqslant p(\eta^{(t)}\,|\,\vect x)$ in the subsection ``Why the \EM\ algorithm works'' only uses the properties of a \GEM\ algorithm. As for the last part, if convergence has occurred there is no further room for increasing $Q$, so we must have reached a maximum. \nextq Take prior for variance $S_0\chi_{\nu}^{-2}$, that is, $350\chi_4^{-2}$, and prior for mean which is $\N(\theta_0,\phi_0)$ where $\theta_0=85$ and $\phi_0=S_0/n_0(\nu-2)=350/(1\times2)=175$. We then have data with $n=100$, $\mean x=89$ and $S_1=(n-1)s^2=2970$. Successive iterations using the \EM\ algorithm give the posterior mean as 88.9793 at the first stage and as 88.9862 at all subsequent stages. This compares with 88.96 as found in Exercise 16 on Chapter 2. \nextq Means for the four looms are 97.3497, 91.7870, 95.7272 and 96.8861, overall mean is 95.4375. Variance of observations from the same loom is 1.6250 and variance of means from different looms in the population is 5.1680. \nextq \!\!(a) For the example on genetic linkage in Section 9.3, see \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/dataaug.txt} \bigskip \noindent or \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/cprogs/dataaug.cpp} \bigskip \blankline (b) For the example on chained data sampling due to Casella and George, see a similar file with \texttt{chained} in place of \texttt{dataaug}. \blankline (c) For the example on the semi-conjugate prior with a normal likelihood (using both the \EM\ algorithm and chained data augmentation), see a similar file with \texttt{semiconj} in place of \texttt{dataaug}. \blankline (d) For the example on a change-point analysis of data on coal mining disasters, see similar files with \texttt{coalnr.cpp} or \texttt{coal.cpp} in place of \texttt{dataaug.cpp} (the difference is that the former uses only routines in W.~H.~Press \textit{et al.}, \textit{Numerical Recipes in C} (2nd edn), Cambridge: University Press 1992, whereas the latter program, which is in many ways preferable, uses gamma variates with non-integer parameters). \nextq See the program referred to in part (a) of the previous answer. \nextq See the program referred to in part (b) of the answer to the question before last. \nextq See the program \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/semicon2.txt} \bigskip \noindent or \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/cprogs/semicon2.cpp} \nextq B.~P.~Carlin and T.~A.~Louis, \textit{Bayes and Empirical Bayes Methods for Data Analysis} (2nd edn), Chapman and Hall 2000, p.\,149, remark that ``In our dataset, each rat was weighed once a week for five consecutive weeks. In fact the rats were all the same age at each weighing $x_{i1}=8$, $x_{i2}=15$, $x_{i3}=22$, $x_{i4}=29$, and $x_{i5}=36$ for all $i$. As a result we may simplify our computations by rewriting the likelihood as \[ Y_{ij}\sim N(\alpha_i+\beta_i(x_{ij}-\overline x)\,,\,\sigma^2), \ i=1, ...,k,\ j=1, ..., n_i, \] so that it is now reasonable to think of $\alpha_i$ and $\beta_i$ as independent \textit{a priori}. Thus we may set $\Sigma=\textit{Diag}(\sigma_{\alpha}^2,\sigma_{\beta}^2)$, and replace the Wishart prior with a product of independent inverse gamma priors, say $\textit{IG}(a_{\alpha},b_{\alpha})$ and $\textit{IG}(a_{\beta},b_{\beta})$.'' This being so, it probably suffices to proceed as in the `Rats' example supplied with WinBUGS.
A more detailed description of the general set-up is to be found in ``Illustration of Bayesian Inference in Normal Data Models Using Gibbs Sampling'', Alan E.~Gelfand, Susan E.~Hills, Amy Racine-Poon, and Adrian F.~M.~Smith, \textit{Journal of the American Statistical Association} \textbf{85} (412) (1990), 972--985, Section 6, pp.\ 978--979, which can be found at \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/gelfand.pdf} \bigskip \noindent (\LaTeX source at \texttt{gelfand.htm}). A program for generating random matrices with a Wishart distribution based on the algorithm of Odell and Feiveson described in W.~J.~Kennedy and J.~E.~Gentle, \textit{Statistical Computing}, Marcel Dekker 1980, pp.\,231--232 can be found at \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/wishart.txt} \bigskip while the relevant extract from Kennedy and Gentle is at \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/rwishart.pdf} \nextq See the program at \bigskip \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/linkagemh.txt} \nextq See the program at \bigskip \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/winbugs/wheat.txt} \bigskip \noindent or \bigskip \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/wheat.txt} \nextq See the program at \bigskip \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/winbugs/logisti2.txt} \bigskip \noindent or \noindent \texttt{http://www-users.york.ac.uk/\texttildelow pml1/bayes/rprogs/logisti2.txt} \section {Exercises on Chapter \arabic{section}} \startqs \nextq For sampling with the Cauchy distribution we proceed thus: \begin{description} \item[\quad(a)] Substituting $x=\tan\theta$ we find \begin{align*} \eta &= \Pr(x > 2) = \int_2^{\infty} \frac{1}{\pi}\frac{1}{1+x^2}\dx =\half-\tan^{-1}(2)/\pi = \tan^{-1}(\half)/\pi \\ &= 0.147\,583\,6. \end{align*} The variance is \[ \eta(1-\eta)/n = 0.125\,802\,7/n. \] \item[\quad(b)] Use the usual change of variable rule to deduce the density of $y$. Then note that \[ \frac{q(y_i)}{p(y_i)}=\frac{1/(\pi(1+y_i^2))}{2/y_i^2} =\frac{1}{2\pi}\frac{y_i^2}{1+y_i^2} \] and that we can take $f(y)=1$ as all values of $y$ satisfy $y\geqslant 2$. \item[\quad(c)] Substitute $x_i=2/y_i$ in the above to get \[ \frac{1}{2\pi}\frac{4}{4+x_i^2}. \] \item[\quad(d)] On writing $x=2\tan\theta$ and $\theta_0=\tan^{-1}(\half)$ we find \[ \E\widehat\eta=\int_0^1\frac{1}{2\pi}\frac{4}{4+x^2}\dx = \int_0^{\theta_0}\frac{1}{2\pi}\frac{1}{\sec^2\theta}2\sec^2\theta\dtheta = \theta_0/\pi = \eta\] Similarly, noting that $\sin(2\theta_0)=4/5$ the integral for $\E\widehat\eta^2$ transforms to \begin{align*} \int_0^{\theta_0}\frac{1}{4\pi^2}\frac{1}{\sec^4\theta}2\sec^2\theta\dtheta &= \int_0^{\theta_0}\frac{1}{4\pi^2} 2\cos^2\theta\dtheta \\ &= \int_0^{\theta_0}\frac{1}{4\pi^2}\{1+\cos(2\theta)\}\dtheta \\ &= \left[\theta+\half\sin(2\theta)\right]_0^{\theta_0}/(4\pi^2) \\ &= \left\{\tan^{-1}(\half)+\twofifths\right\}/(4\pi^2) \\ &= 0.021\,876\,4 \end{align*} so that \begin{align*} \Var\widehat\eta = \E\widehat\eta^2 - (\E\widehat\eta)^2 = 0.000\,095\,5.
\end{align*} \end{description} \nextq A suitable program is \begin{tabbing} \hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill \> \prog{n <- 10000} \\ \> \prog{alpha <- 2} \\ \> \prog{beta <- 3} \\ \> \prog{x <- runif(n)} \\ \> \prog{p <- function(x) dunif(x)} \\ \> \prog{q <- function(x) dbeta(x,alpha,beta)} \\ \> \prog{w <- q(x)/p(x)} \\ \> \prog{pi <- w/sum(w)} \\ \> \prog{samp <- sample(x,size=n,prob=pi,replace=T)} \\ \> \prog{print(mean(samp))} \\ \> \prog{print(var(samp))} \end{tabbing} A run of this program resulted in a value of the mean of 0.399 as compared with the true value of 0.4 and a value of the variance of 0.0399 compared with a true value of 0.04. \nextq Continue the program (avoiding ambiguity in the definition of $\alpha$) by \begin{tabbing} \hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill \> \prog{theta <- 0.1} \\ \> \prog{le <- (1-theta)*n} \\ \> \prog{lo <- 1:(n-le)} \\ \> \prog{hi <- (le+1):n} \\ \> \prog{y <- sort(samp)} \\ \> \prog{r <- y[hi]-y[lo]} \\ \> \prog{rm <- min(r)} \\ \> \prog{lom <- min(lo[r==rm])} \\ \> \prog{him <- min(hi[r==rm])} \\ \> \prog{dd <- function(x) dbeta(x,alpha,beta)} \\ \> \prog{plot(dd,xlim=c(0,1),ylim=c(0,2))} \\ \> \prog{par(new=T)} \\ \> \prog{plot(density(samp),xlim=c(0,1),ylim=c(0,2),} \\ \> \> \prog{main="",xlab="",ylab="")} \\ \> \prog{abline(v=y[lom])} \\ \> \prog{abline(v=y[him])} \\ \> \prog{print(y[lom])} \\ \> \prog{print(y[him])} \end{tabbing} The lower limit resulting from a run of the program was 0.061 and the upper limit was 0.705. Referring to the tables in the Appendix we find that values of $F$ corresponding to a 90\% HDR of $\log \F_{6,4}$ are $\underline\F = 0.21$ and $\overline\F = 5.75$. It follows that an appropriate interval of values of $\F_{4,6}$ is $(1/\overline\F, 1/\underline\F)$, that is $(0.17, 4.76)$, so that an appropriate interval for $\pi$ is \[ \frac{2\times 0.17}{3+2\times 0.17}\leqslant\pi\leqslant \frac{2\times 4.76}{3+2\times 4.76} \] that is $(0.10, 0.76)$ (but note that in Section 3.1 we were looking for intervals in which the distribution of the log-odds is higher than anywhere outside, rather than for HDRs for the beta distribution itself).
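\blankline As a further check on the limits found by simulation (an editorial addition, not part of the original answer), the exact 90\% HDR of the $\Be(2,3)$ distribution, whose density is unimodal, can be found by minimizing the length of an interval containing probability 0.9 over the lower tail probability; the search interval $(0,0.1)$ for that tail probability is an arbitrary choice.
\begin{tabbing}
\hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill
\> \prog{alpha <- 2} \\
\> \prog{beta <- 3} \\
\> \prog{width <- function(p) qbeta(p+0.9,alpha,beta) - qbeta(p,alpha,beta)} \\
\> \prog{opt <- optimize(width,c(0,0.1))} \\
\> \prog{print(qbeta(opt\$minimum,alpha,beta))} \\
\> \prog{print(qbeta(opt\$minimum+0.9,alpha,beta))}
\end{tabbing}
\noindent The limits obtained in this way should be close to the values 0.061 and 0.705 found from the simulated sample above.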
\nextq A suitable program is \begin{tabbing} \hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill \> \prog{r <- 10} \\ \> \prog{phi <- rep(NA,r)} \\ \> \prog{mu <- rep(NA,r)} \\ \> \prog{S <- rep(NA,r)} \\ \> \prog{n <- 100; xbar <- 89; SS <- (n-1)*30} \\ \> \prog{mu0 <- 85; S0 <- 350; nu0 <- 4; n0 <- 1} \\ \> \prog{phi0 <- S0/(n0*(nu0-2))} \\ \> \prog{S[1] <- S0} \\ \> \prog{nustar <- nu0 + n} \\ \> \prog{for (i in 2:r) \{} \\ \> \> \prog{phi[i] <- (1/phi0 + n*nustar/S[i-1])\^{}\{-1\}} \\ \> \> \prog{mu[i] <- phi[i]*(mu0/phi0 + n*xbar*nustar/S[i-1])} \\ \> \> \prog{S[i] <- S0 + (SS+n*xbar\^{}2) - 2*n*xbar*mu[i] +} \\ \> \> \> \> \> \> \> \> \> \prog{n*(mu[i]\^{}2 + phi[i])} \\ \> \> \prog{cat("i",i,"phi",phi[i],"mu",mu[i],"S",S[i],"$\backslash$n")} \\ \> \prog{\}} \\ \> \prog{mustar <- mu[r]; phistar <- phi[r]; Sstar <- S[r]} \\ \> \prog{cat("mu has mean",mustar,"and s.d.",sqrt(phistar),"$\backslash$n")} \\ \> \prog{cat("phi has mean",Sstar/(nustar-2),"and s.d.",} \\ \> \> \> \> \prog{(Sstar/(nustar-2))*sqrt(2/(nustar-4)),"$\backslash$n")} \\ \end{tabbing} We get convergence to $\mu=88.993$ and $\phi=0.322$. \nextq In the discrete case the Kullback-Leibler divergence is \begin{align*} \I(q:p) &= \sum q(x) \log\{q(x)/p(x)\} \\ &= \sum \binom{n}{x}\rho^x(1-\rho)^{n-x} \log \left\{\binom{n}{x}\rho^x(1-\rho)^{n-x}\left/ \binom{n}{x}\pi^x(1-\pi)^{n-x}\right.\right\} \\ &= \sum \binom{n}{x}\rho^x(1-\rho)^{n-x} \left\{x\log(\rho/\pi)+(n-x)\log[(1-\rho)/(1-\pi)]\right\} \\ &= n\rho\log(\rho/\pi) + n(1-\rho)\log[(1-\rho)/(1-\pi)] \end{align*} using the fact that the mean of $\B(n,\rho)$ is $n\rho$. For $\I(q:p)=\I(p:q)$ we need {\small \[ n\rho\log(\rho/\pi) + n(1-\rho)\log[(1-\rho)/(1-\pi)] = n\pi\log(\pi/\rho) + n(1-\pi)\log[(1-\pi)/(1-\rho)] \] } so \[ (\pi-\rho)\log\left\{\frac{\pi}{1-\pi}\left/ \frac{\rho}{1-\rho}\right.\right\}=0 \] from which it is clear that this can happen if and only if $\pi=\rho$ (when $\I(q:p)=\I(p:q)=0$). \nextq In the continuous case the Kullback-Leibler divergence is \begin{align*} \I(q:p) &= \int q(x) \log\{q(x)/p(x)\}\dx \\ &= \int (2\pi\psi)^{-1/2}\exp\{-\half(x-\nu)^2/\psi\}\times \\ &\quad\left\{\log\left[ (2\pi\psi)^{-1/2}\exp\{-\half(x-\nu)^2/\psi\}\right]\right. \\ &\qquad\left.-\log\left[ (2\pi\phi)^{-1/2}\exp\{-\half(x-\mu)^2/\phi\}\right]\right\} \\ &= \half\log(\phi/\psi)-\half\E(x-\nu)^2/\psi+\half\E(x-\mu)^2/\phi \end{align*} where $x\sim\N(\nu,\psi)$. By writing $(x-\mu)^2=\{(x-\nu)+(\nu-\mu)\}^2$ it is easily concluded that \[ 2\I(q:p) = \log(\phi/\psi)+(\psi-\phi)/\phi+(\nu-\mu)^2/\phi. \] In particular if $\phi=\psi$ then \[ \I(q:p) = \half(\nu-\mu)^2/\phi. \] \nextq We find \begin{align*} \I(q:p) &= \int q(x) \log\{q(x)/p(x)\}\dx \\ &= \int_0^{\infty} \beta^{-1}\exp(-x/\beta) \log\left\{\left(\beta^{-1}\exp(-x/\beta)\right)\left/ 2(2\pi)^{-1/2}\exp(-\half x^2)\right.\right\}\dx \\ &= \half\log(\pi/2)-\log\beta - \E x/\beta +\half\E x^2 \end{align*} where $x \sim \Ex(\beta)$. It follows that \[ \I(q:p) = \half\log(\pi/2)-\log\beta - 1 + \beta^2 \] and hence \begin{align*} \frac{\partial\I(q:p)}{\partial\beta} &= -\frac{1}{\beta} + 2\beta \\ \frac{\partial^2\I(q:p)}{\partial\beta^2} &=\frac{1}{\beta^2} + 2 \end{align*} It follows that $\I(q:p)$ is a minimum when $\beta=1/\sqrt{2}$. \nextq See the paper by Corduneanu and Bishop (2001).
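\blankline For the exponential approximation $q(x)=\beta^{-1}\exp(-x/\beta)$ to the half-normal density $p(x)=2(2\pi)^{-1/2}\exp(-\half x^2)$ considered two answers back, the minimizing value of $\beta$ can be confirmed numerically. The \R\ fragment below is an editorial addition rather than part of the original answer; it evaluates $\I(q:p)$ by numerical integration over a grid of values of $\beta$, the grid limits 0.3 and 3 being arbitrary choices.
\begin{tabbing}
\hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill
\> \prog{kl <- function(beta) \{} \\
\> \> \prog{f <- function(x) \{} \\
\> \> \> \prog{q <- exp(-x/beta)/beta} \\
\> \> \> \prog{p <- 2*dnorm(x)} \\
\> \> \> \prog{q*log(q/p)} \\
\> \> \prog{\}} \\
\> \> \prog{integrate(f,0,Inf)\$value} \\
\> \prog{\}} \\
\> \prog{betagrid <- seq(0.3,3,by=0.01)} \\
\> \prog{vals <- sapply(betagrid,kl)} \\
\> \prog{print(betagrid[which.min(vals)])}
\end{tabbing}
\noindent The minimum should be found close to $\beta=1/\sqrt{2}\approx0.71$, in agreement with the answer above.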
\nextq The model is \[ (\quarter+\quarter\eta,\,\quarter\eta,\,\quarter(1-\eta),\, \quarter(1-\eta)+\quarter) \] and the values quoted are $x_1=461$, $x_2=130$, $x_3=161$ and $x_4=515$. An \R\ program for the \ABCREJ\ algorithm is \begin{tabbing} \hspace*{2ex} \=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\=\hspace{2ex}\kill \> \prog{N <- 100000} \\ \> \prog{etastar <- c(461,130,161,515)} \\ \> \prog{n <- sum(etastar)} \\ \> \prog{eps <- 22} \\ \> \prog{d <- rep(NA,N)} \\ \> \prog{trial <- rep(NA,N)} \\ \> \prog{for (j in 1:N) \{} \\ \> \> \prog{etatrial <- runif(1)} \\ \> \> \prog{inds <- sample(4,n,replace=T,prob=c(0.25+etatrial/4,etatrial/4,} \\ \> \> \> \> \> \> \> \> \prog{(1-etatrial)/4,(1-etatrial)/4+0.25))} \\ \> \> \prog{samp <- c(sum(inds==1),sum(inds==2),sum(inds==3),sum(inds==4))} \\ \> \> \prog{d[j] <- sqrt(sum((etastar-samp)\^{}2))} \\ \> \> \prog{if (d[j] <= eps) trial[j] <- etatrial} \\ \> \prog{\}} \\ \> \prog{eta <- trial[!is.na(trial)]} \\ \> \prog{k <- length(eta)} \\ \> \prog{m <- mean(eta)} \\ \> \prog{s <- sd(eta)} \\ \> \prog{cat("k",k,"m",m,"s",s,"$\backslash$n")} \end{tabbing} Similar modifications can be made for the other algorithms. \nextq Let $M_i$ denote model $i$ ($i = 1$, 2, 3, 4) with model parameters $\theta_i$, where $\theta_1=(\alpha,\tau)$, $\theta_2 = (\alpha,\beta,\tau)$, $\theta_3 = (\alpha,\lambda,\tau)$ and $\theta_4 = (\alpha,\beta,\lambda,\tau)$. \medskip \noindent The reversible jump MCMC algorithm is as follows: \begin{itemize} \item Initiate the algorithm by choosing an initial model $m$ to start in and initial parameter values $\theta_m$. \item At each iteration $t$: \begin{itemize} \item Update the model parameters $\theta_{m_{t-1}}$, where $m_{t-1}$ is the model in which the algorithm is residing at the end of iteration $t - 1$. \item Propose to move to a new model $M_t'$ with parameter values $\theta'$. Calculate the acceptance probability for the move and decide whether to accept or reject the new model (and parameter values). \end{itemize} \end{itemize} The models are nested, with model 4 being the full model, models 2 and 3 being sub-models, and model 1 the simplest. Each model either includes or excludes $\beta$ and $\lambda$ (exclusion of a parameter is equivalent to setting it equal to 0). \medskip \noindent Given that the current model is $j$, employ the following reversible jump: \medskip \noindent Choose randomly either $\beta$ or $\lambda$, and propose to move to the model $i$ which has the chosen parameter included (excluded) if the parameter is excluded (included) in model $j$. \medskip \noindent A bijection is required for transforming between the models. Therefore for inclusion of a parameter propose $u \sim \N(0, \sigma^2)$ for suitably chosen $\sigma^2$ and propose to set the parameter equal to $u$. (This works best if $x_i$ and $t_i$ have been standardised to have mean 0 and variance 1.) Leave the other parameters unchanged. Thus the Jacobian of the transformation is simply 1. \medskip \noindent Let $L_j(\theta_j)$ denote the likelihood of $\theta_j$ given model $M_j$. Then the acceptance probability for the inclusion of a parameter (either $\beta$ or $\lambda$ set equal to $u$) is \[ \min\left\{1,\ \frac{L_i(\theta_i)}{L_j(\theta_j)}\times \frac{\pi(\theta_i,M_i)}{\pi(\theta_j,M_j)}\times\frac{1/2}{1/2} \times\frac{1}{\exp(-\half u^2/\sigma^2)/\sqrt{2\pi\sigma^2}} \right\}. \] \end{document} %