% LaTeX source for Answers to Bayesian Statistics: An Introduction (4th edn)
\documentclass[oneside]{book}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsbsy}
\usepackage{makeidx}
\usepackage{epsf}
\usepackage{euscript}
\setcounter{secnumdepth}{1}
\setcounter{tocdepth}{1}
% Set up environment for exercises at ends of chapters
\newcounter{qno}
\newcommand{\startqs}{\setcounter{qno}{0}\vspace{-1.5\baselineskip}}
\newcommand{\nextq}
{\vspace{1.5\baselineskip}\noindent\addtocounter{qno}{1}\arabic{qno}.\quad}
% Allow for blank lines
\newcommand{\blankline}{\vspace{\baselineskip}\noindent}
% Define digitwidth and dotwidth (TeXbook p. 241)
\newdimen\digitwidth
\setbox0=\hbox{\rm0}
\digitwidth=\wd0
\newdimen\dotwidth
\setbox0=\hbox{\rm.}
\dotwidth=\wd0
% Notation for vectors, matrices, estimates, random variables and sample means
\newcommand{\vect}{\boldsymbol}
\newcommand{\matr}{\boldsymbol}
\newcommand{\est}{\widehat}
\newcommand{\random}{\widetilde}
\newcommand{\mean}{\overline}
% Notation for dots in subscripts
\newcommand {\bdot}{\hbox{\Huge .}}
\newcommand {\dotdot}{{\hbox{\Huge .}\kern-0.1667em\hbox{\Huge .}}}
\newcommand {\onedot}{1\kern-0.1667em\bdot}
\newcommand {\twodot}{2\kern-0.1667em\bdot}
\newcommand {\idot}{i\kern-0.1667em\bdot}
\newcommand {\jdot}{j\kern-0.1667em\bdot}
\newcommand {\mdot}{m\kern-0.1667em\bdot}
\newcommand {\dotj}{\kern-0.1667em\bdot\kern-0.1667em j}
% Define sech, arc sin and arc cos
\newcommand{\sech}{\operatorname{sech}}
\renewcommand{\arcsin}{\operatorname{arc\,sin}}
\renewcommand{\arccos}{\operatorname{arc\,cos}}
% Define Probability, Expectation, Variance, Covariance, Median, Mode
\renewcommand{\Pr}{\mbox{$\mathsf P$}}
\newcommand{\E}{\mbox{$\mathsf E$}}
\newcommand{\Var}{\mbox{$\mathcal V$}}
\newcommand{\Cov}{\mbox{$\mathcal C$}}
\newcommand{\median}{\mbox{median\,}}
\newcommand{\mode}{\mbox{mode\,}}
\newcommand{\MCMC}{\mbox{\text{MCMC}}}
% Define notation for evidence
\newcommand{\Ev}{\mbox{Ev}}
% Notation for the R project
\newcommand{\R}{\textsf{R}}
% Script I for a (Kullback-Leibler) information measure
\newcommand{\I}{\mbox{$\EuScript I$}}
% Define small common fractions for use in display formulae
\newcommand{\half}{\mbox{$\frac{1}{2}$}}
\newcommand{\smallhalf}{\mbox{\small$\frac{1}{2}$}}
\newcommand{\quarter}{\mbox{$\frac{1}{4}$}}
\newcommand{\threequarters}{\mbox{$\frac{3}{4}$}}
\newcommand{\third}{\mbox{$\frac{1}{3}$}}
\newcommand{\twothirds}{\mbox{$\frac{2}{3}$}}
\newcommand{\ninth}{\mbox{$\frac{1}{9}$}}
\newcommand{\twofifths}{\mbox{$\frac{2}{5}$}}
% Alternative notation for fractions (TeXbook, exercise 11.6)
\newcommand{\slopefrac}[2]{\leavevmode\kern.1em
\raise .5ex\hbox{\the\scriptfont0 #1}\kern-.1em
/\kern-.15em\lower .25ex\hbox{\the\scriptfont0 #2}}
% Notation for beta function
\newcommand{\Betafn}{\mbox{B}}
% Define names of distributions
\newcommand{\N}{\mbox{N}} % A.1
\newcommand{\G}{\mbox{G}} % A.4
\newcommand{\Ex}{\mbox{E}} % A.4
\renewcommand{\t}{\mbox{t}} % A.8
\newcommand{\Be}{\mbox{Be}} % A.10
\newcommand{\B}{\mbox{B}} % A.11
\renewcommand{\P}{\mbox{P}} % A.12
\newcommand{\NB}{\mbox{NB}} % A.13
\renewcommand{\H}{\mbox{H}} % A.14
\newcommand{\U}{\mbox{U}} % A.15
\newcommand{\UD}{\mbox{UD}} % A.15
\newcommand{\Pa}{\mbox{Pa}} % A.16
\newcommand{\Pabb}{\mbox{Pabb}} % A.16
\newcommand{\M}{\mbox{M}} % A.17
\newcommand{\BF}{\mbox{BF}} % A.18
\newcommand{\F}{\mbox{F}} % A.19
\newcommand{\z}{\mbox{z}} % A.20
\newcommand{\C}{\mbox{C}} % A.21
% Define some common bold symbols
\newcommand{\balpha}{\mbox{$\boldsymbol\alpha$}}
\newcommand{\bbeta}{\mbox{$\boldsymbol\beta$}}
\newcommand{\beeta}{\mbox{$\boldsymbol\eta$}}
\newcommand{\btheta}{\mbox{$\boldsymbol\theta$}}
\newcommand{\bkappa}{\mbox{$\boldsymbol\kappa$}}
\newcommand{\blambda}{\mbox{$\boldsymbol\lambda$}}
\newcommand{\bmu}{\mbox{$\boldsymbol\mu$}}
\newcommand{\bnu}{\mbox{$\boldsymbol\nu$}}
\newcommand{\bpi}{\mbox{$\boldsymbol\pi$}}
\newcommand{\btau}{\mbox{$\boldsymbol\tau$}}
\newcommand{\bzero}{\mbox{$\boldsymbol0$}}
\newcommand{\BOmega}{\mbox{$\boldsymbol\Omega$}}
% Further bold symbols for use in connection with hierarchical models
\newcommand {\bpiem}{\mbox{\boldmath $\pi^{EM}$}}
\newcommand {\bhtheta}{\mbox{\boldmath $\est\theta$}}
\newcommand {\bhthetao}{\mbox{\boldmath $\est\theta^{\mbox{\scriptsize\it0}}$}}
\newcommand {\bhthetajs}{\mbox{\boldmath $\est\theta^{JS}$}}
\newcommand {\bhthetajsplus}{\mbox{\boldmath $\est\theta^{JS^{{}_+}}$}}
\newcommand {\bhthetaem}{\mbox{\boldmath $\est\theta^{EM}$}}
\newcommand {\bhthetab}{\mbox{\boldmath $\est\theta^{B}$}}
\newcommand {\bhthetaeb}{\mbox{\boldmath $\est\theta^{EB}$}}
\newcommand {\thetabar}{\mbox{$\mean\theta$}}
\newcommand {\bphi}{\mbox{\boldmath $\phi$}}
\newcommand {\BPhi}{\mbox{\boldmath $\Phi$}}
\newcommand {\bpsi}{\mbox{\boldmath $\psi$}}
\newcommand {\BPsi}{\mbox{\boldmath $\Psi$}}
\newcommand {\BSigma}{\mbox{\boldmath $\Sigma$}}
% Define transpose for matrix theory
\newcommand{\transpose}{\mbox{${}^{\text{T}}$}}
% Define differentials with roman d and thin space before
\renewcommand{\d}{\mbox{d}}
\newcommand{\dF}{\,\mbox{\d$F$}}
\newcommand{\dt}{\,\mbox{\d$t$}}
\newcommand{\du}{\,\mbox{\d$u$}}
\newcommand{\dU}{\,\mbox{\d$U$}}
\newcommand{\dx}{\,\mbox{\d$x$}}
\newcommand{\dy}{\,\mbox{\d$y$}}
\newcommand{\dz}{\,\mbox{\d$z$}}
\newcommand{\dgamma}{\,\mbox{\d$\gamma$}}
\newcommand{\dzeta}{\,\mbox{\d$\zeta$}}
\newcommand{\deta}{\,\mbox{\d$\eta$}}
\newcommand{\dtheta}{\,\mbox{\d$\theta$}}
\newcommand{\dbtheta}{\,\mbox{\d$\boldsymbol\theta$}}
\newcommand{\dkappa}{\,\mbox{\d$\kappa$}}
\newcommand{\dlambda}{\,\mbox{\d$\lambda$}}
\newcommand{\dLambda}{\,\mbox{\d$\Lambda$}}
\newcommand{\dmu}{\,\mbox{\d$\mu$}}
\newcommand{\dbmu}{\,\mbox{\d$\bmu$}}
\newcommand{\drho}{\,\mbox{\d$\rho$}}
\newcommand{\dpi}{\,\mbox{\d$\pi$}}
\newcommand{\dxi}{\,\mbox{\d$\xi$}}
\newcommand{\dphi}{\,\mbox{\d$\phi$}}
\newcommand{\dpsi}{\,\mbox{\d$\psi$}}
\newcommand{\domega}{\,\mbox{\d$\omega$}}
% Hyp for hypothesis
\newcommand{\Hyp}{\mbox{H}}
% Blackboard bold Z for the integers
\newcommand{\Z}{\mbox{$\mathbb Z$}}
% Script X for a set of possible observations
\newcommand{\X}{\mbox{$\mathcal X$}}
% EM, GEM, E-step and M-step for the EM algorithm
\newcommand{\EM}{\mbox{\textit{EM}\ }}
\newcommand{\GEM}{\mbox{\textit{GEM}\ }}
\newcommand{\Estep}{\mbox{\textit{E}-step\ }}
\newcommand{\Mstep}{\mbox{\textit{M}-step\ }}
% Omit the word Chapter at the start of chapters
\renewcommand{\chaptername}{}
\begin{document}
\pagestyle{plain}
\appendix
\setcounter{chapter}{8}
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq A card game is played with 52 cards divided equally between four
players, North, South, East and West, all arrangements being
equally likely. Thirteen of the cards are referred to as trumps.
If you know that North and South have ten trumps between them,
what is the probability that all three remaining trumps are in the
same hand? If it is known that the king of trumps is included
among the other three, what is the probability that one player has
the king and the other the remaining two trumps?
\nextq\!\!\!\!(a) Under what circumstances is an event $A$ independent
of itself?
\begin{description}
\item[\quad(b)] By considering events concerned with independent
tosses of a red die and a blue die, or otherwise, give examples
of events $A$, $B$ and $C$ which are not independent, but
nevertheless are such that every pair of them is independent.
\item[\quad(c)] By considering events concerned with three independent
tosses of a coin and supposing that $A$ and $B$ both represent
tossing a head on the first trial, give examples of events $A$,
$B$ and $C$ which are such that $\Pr(ABC)=\Pr(A)\Pr(B)\Pr(C)$
although no pair of them is independent.
\end{description}
\nextq Whether certain mice
are black or brown depends on a pair of genes,
each of which is either $B$ or $b$. If both members of the pair
are alike, the mouse is said to be homozygous, and if they are
different it is said to be heterozygous. The mouse is brown
only if it is homozygous
$bb$. The offspring of a pair of mice have two such genes, one
from each parent, and if the parent is heterozygous, the
inherited gene is equally
likely to be $B$ or $b$. Suppose that a black mouse results from a
mating between two heterozygotes.
\begin{description}
\item[\quad(a)] What are the probabilities that this mouse is homozygous
and that it is heterozygous?
\end{description}
Now suppose that this mouse is mated with a brown mouse, resulting
in seven offspring, all of which turn out to be black.
\begin{description}
\item[\quad(b)] Use Bayes' Theorem to find the probability that the
black mouse was homozygous $BB$.
\item[\quad(c)] Recalculate the same probability by regarding the seven
offspring as seven observations made sequentially, treating the
posterior after each observation as the prior for the next (cf.\
Fisher, 1959, Section II.2).
\end{description}
\nextq The example on Bayes' Theorem in Section 1.2
concerning the biology of twins
was based on the assumption that births of boys and girls occur
equally frequently, and yet it has been known for a very long time
that fewer girls are born than boys (cf.\ Arbuthnot, 1710).
Suppose that the probability of a girl is $p$, so that
\[
\begin{array}{lll}
\Pr(GG|M)=p, &\Pr(BB|M)=1 - p, &\Pr(GB|M)=0,
\\
\Pr(GG|D)=p^2, &\Pr(BB|D)=(1 - p)^2, &\Pr(GB|D)=2p(1 - p).
\end{array}
\]
Find the proportion of monozygotic twins in the whole population
of twins in terms of $p$ and the sex distribution among all twins.
\nextq Suppose a red and a blue die are tossed. Let $x$ be the sum of
the number showing on the red die and twice the number showing on
the blue
die. Find the density function and the distribution function of
$x$.
\nextq Suppose that $k\sim\B(n,\pi)$ where $n$ is large and $\pi$ is
small but $n\pi=\lambda$ has an intermediate value. Use the
exponential
limit $(1+x/n)^n\to\text{e}^x$ to show that
$\Pr(k=0)\cong \text{e}^{-\lambda}$ and
$\Pr(k=1)\cong \lambda\text{e}^{-\lambda}$. Extend this result to
show that $k$ is such that
\[ p(k) \cong \frac{\lambda^k}{k!}\exp(-\lambda) \]
that is, $k$ is approximately distributed as a Poisson variable of
mean $\lambda$ (cf.\ Appendix A).
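\blankline
[A quick numerical check of this approximation can be made in \R; the
values of $n$ and $\pi$ below are arbitrary illustrative choices.]
\begin{verbatim}
# Compare exact binomial probabilities with the Poisson approximation
# when n is large, p is small and lambda = n * p is of moderate size.
n <- 1000
p <- 0.002
k <- 0:6
round(cbind(binomial = dbinom(k, n, p),
            poisson  = dpois(k, n * p)), 6)
\end{verbatim}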
\nextq Suppose that $m$ and $n$ have independent Poisson distributions
of means $\lambda$ and $\mu$ respectively (see question
6) and that $k=m+n$. Find the distribution of $k$.
\nextq Modify the formula for the density of a one-to-one function $g(x)$
of a random variable $x$ to find an expression for the density of
$x^2$ in terms of that of $x$, in both the continuous and discrete
case. Hence show that the square of a standard normal distribution
has a chi-squared distribution on one degree of freedom as defined
in Appendix A.
\nextq Suppose that $x_1, x_2, \dots, x_n$ are independent and all
have the
same continuous distribution, with density $f(x)$ and distribution
function $F(x)$. Find the distribution functions of
\[ M = \max \{x_1, x_2, \dots, x_n\} \quad\text{and}\quad
m = \min \{x_1, x_2, \dots, x_n\} \]
in terms of $F(x)$, and so find expressions for the density
functions of $M$ and $m$.
\nextq Suppose that $u$ and $v$ are independently uniformly distributed
on the interval [0, 1], so that they divide the interval into three
sub-intervals. Find the joint density function of the lengths of
the first two sub-intervals.
\nextq Show that two continuous random variables $x$ and $y$ are
independent (that is, $p(x, y)=p(x)p(y)$ for all $x$ and $y$) if
and only if their joint distribution function $F(x, y)$ satisfies
$F(x, y)=F(x)F(y)$ for all $x$ and $y$. Prove that the same thing
is true for discrete random variables. [This is an example of a
result which is easier to prove in the continuous case.]
\nextq Suppose that the random variable $x$ has a negative binomial
distribution
$\NB(n, \pi)$ of index $n$ and parameter $\pi$, so that
\[ p(x) = \binom{n+x-1}{x} \pi^n (1 - \pi)^x \]
Find the mean and variance of $x$ and check that your answer agrees
with that given in Appendix A.
\nextq A random variable $X$ is said to have a chi-squared distribution
on $\nu$ degrees of freedom if it has the same distribution as
\[ Z_1^2+Z_2^2+\dots+Z_{\nu}^2 \]
where $Z_1$, $Z_2$, $\dots$, $Z_{\nu}$ are independent standard
normal variates. Use the facts that $\E Z_i=0$, $\E Z_i^2=1$ and
$\E Z_i^4=3$ to find the mean and variance of $X$. Confirm these
values using the probability density of $X$, which is
\[ p(X)=\frac{1}{2^{\nu/2}\Gamma(\nu/2)}X^{\nu/2-1}\exp(-\half X)
\qquad(0 < X < \infty) \]
(see Appendix A).
\nextq The \textit{skewness} of a random variable $x$ is defined as
$\gamma_1 = \mu_3/(\mu_2)^{\frac{3}{2}}$ where
\[ \mu_n = \E (x - \E x)^n \]
(but note that some authors work in terms of $\beta_1 =
\gamma_1^2$).
Find the skewness of a random variable $X$ with a binomial
distribution $B(n, \pi)$ of index $n$ and parameter $\pi$.
\nextq Suppose that a continuous random variable $X$ has mean $\mu$ and
variance $\phi$. By writing
\[ \phi = \int (x-\mu)^2 p(x)\dx \geqslant
\int_{\{x;\,|x-\mu|\geqslant c\}} (x-\mu)^2 p(x)\dx \]
and using a lower bound for the integrand in the latter integral,
prove that
\[ \Pr(|x-\mu|\geqslant c)\leqslant\frac{\phi}{c^2}. \]
Show that the result also holds for discrete random variables.
[This result is known as \v Ceby\v sev's Inequality (the name is
spelt in many other ways, including Chebyshev and Tchebycheff).]
\nextq Suppose that $x$ and $y$ are such that
\[ \Pr(x=0, y=1)=\Pr(x=0, y=-1)=\Pr(x=1, y=0)=\Pr(x=-1,
y=0)=\quarter. \]
Show that $x$ and $y$ are uncorrelated but that they are
\textit{not} independent.
\nextq Let $x$ and $y$ have a bivariate normal distribution
and suppose that
$x$ and $y$ both have mean 0 and variance 1, so that their marginal
distributions are standard normal and their joint density is
\[ p(x, y) = \left\{2\pi\sqrt{(1 - \rho^2)}\right\}^{-1}
\exp \left\{- \half(x^2 - 2\rho xy + y^2)/(1 - \rho^2) \right\}.
\]
Show that if the correlation coefficient between $x$ and $y$ is
$\rho$, then that between $x^2$ and $y^2$ is $\rho^2$.
\nextq Suppose that $x$ has a Poisson distribution
(see question 6) $\P(\lambda)$ of mean
$\lambda$ and that, for given $x$, $y$ has a binomial distribution
$\B(x, \pi)$ of index $x$ and parameter $\pi$.
\begin{description}
\item[\quad(a)] Show that the unconditional distribution of $y$ is
Poisson of mean
\[ \lambda\pi = \E_{\random x}
\E_{\random y|\random x}(\random y|\random x). \]
\item[\quad(b)] Verify that the formula
\[ \Var\,\random y =
\E_{\random x}\Var_{\random y|\random x}(\random y|\random x)
+\Var_{\random x}\E_{\random y|\random x}(\random y|\random x)
\]
derived in Section 1.5 holds in this case.
\end{description}
\nextq Define
\[ I=\int_{0}^{\infty}\exp(-\half z^2)\,dz \]
and show (by setting $z=xy$ and then substituting $z$ for $y$) that
\[ I=\int_{0}^{\infty}\exp(-\half(xy)^2)\,y\,dx
=\int_{0}^{\infty}\exp(-\half(zx)^2)\,z\,dx. \]
Deduce that
\[ I^2=\int_{0}^{\infty}\int_{0}^{\infty}
\exp\{-\half(x^2+1)z^2\}\,z\,dz\,dx. \]
By substituting $(1+x^2)z^2=2t$ so that $z\,dz=dt/(1+x^2)$ show
that $I=\sqrt{\pi/2}$ so that the density of the standard normal
distribution as defined in Section 1.3 does integrate to unity
and so is indeed a density. (This method is due to Laplace, 1812,
Section 24.)
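\blankline
[The value $I=\sqrt{\pi/2}$ is easily checked numerically, for instance
in \R:]
\begin{verbatim}
# Numerical check that the integral of exp(-z^2/2) over (0, Inf)
# equals sqrt(pi/2), so the standard normal density integrates to one.
I <- integrate(function(z) exp(-z^2/2), lower = 0, upper = Inf)$value
c(I = I, sqrt.pi.over.2 = sqrt(pi/2))
\end{verbatim}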
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq Suppose that $k\sim\B(n,\pi)$. Find the standardized likelihood
as a function of $\pi$ for given $k$. Which of the distributions
listed in Appendix A does this represent?
\nextq Suppose we are given the twelve observations from a normal
distribution:
\begin{center}
15.644,\ \ \ 16.437,\ \ \ 17.287,\ \ \ 14.448,\ \ \ 15.308,\ \ \
15.169, \\
18.123,\ \ \ 17.635,\ \ \ 17.259,\ \ \ 16.311,\ \ \ 15.390,\ \ \
17.252,
\end{center}
and we are told that the variance $\phi = 1$. Find a 90\% HDR for
the posterior distribution of the mean assuming the usual reference
prior.
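\blankline
[A minimal \R\ sketch of the calculation follows; with the reference
prior the posterior of the mean is $\N(\mean x,\phi/n)$, so the 90\% HDR
is symmetric about $\mean x$.]
\begin{verbatim}
x   <- c(15.644, 16.437, 17.287, 14.448, 15.308, 15.169,
         18.123, 17.635, 17.259, 16.311, 15.390, 17.252)
phi <- 1                    # known variance
n   <- length(x)
mean(x) + c(-1, 1) * qnorm(0.95) * sqrt(phi / n)   # 90% HDR
\end{verbatim}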
\nextq With the same data as in the previous question, what is the
predictive distribution for a possible future observation $x$?
\nextq A random sample of size $n$ is to be taken from an
$\N(\theta,\phi)$
distribution where $\phi$ is known. How large must $n$ be to
reduce the posterior variance of $\theta$ to the fraction $\phi/k$
of its original value (where $k > 1$)?
\nextq Your prior beliefs about a quantity $\theta$ are such that
\[ p(\theta) = \left\{\begin{array}{ll} 1 & (\theta \geqslant 0)
\\
0 & (\theta < 0).
\end{array}\right.
\]
A random sample of size 25 is taken from an $\N(\theta, 1)$
distribution
and the mean of the observations is observed to be 0.33. Find a
95\% HDR for $\theta$.
\nextq Suppose that you have prior beliefs about an unknown quantity
$\theta$ which can be approximated by an $\N(\lambda, \phi)$
distribution, while my beliefs can be approximated by an
$\N(\mu,\psi)$ distribution. Suppose further that the reasons
that have led us to these conclusions do not overlap with one
another. What distribution should represent our beliefs about
$\theta$ when we take into account
all the information available to both of us?
\nextq Prove the theorem quoted without proof in Section
2.4.
\nextq Under what circumstances can a likelihood arising from a
distribution in the exponential family be expressed in data
translated form?
\nextq Suppose that you are interested in investigating how variable the
performance of schoolchildren on a new mathematics test is, and that
you begin by trying this test out on children in twelve similar
schools. It turns out that the average standard deviation is
about 10 marks. You then want to try the test on a thirteenth
school, which is fairly
similar to those you have already investigated, and you reckon that
the data on the other schools gives you a prior for the variance in
this new school which has a mean of 100 and is worth 8 direct
observations on the school. What is the posterior distribution for
the variance if you then observe a sample of size 30 from the
school of which the standard deviation is 13.2? Give an interval
in which the variance lies with 90\% posterior probability.
\nextq The following are the dried weights of a number of plants (in
grammes) from a batch of seeds:
\begin{center}
4.17,\ \ 5.58,\ \ 5.18,\ \ 6.11,\ \ 4.50,\ \ 4.61,\ \ 5.17,\ \
4.53,\ \ 5.33,\ \ 5.14.
\end{center}
Give 90\% HDRs for the mean and variance of the population from
which they come.
\nextq Find a sufficient statistic
for $\mu$ given an $n$-sample
$\vect x = (x_1, x_2, \dots, x_n)$ from the exponential
distribution
\[ p(x|\mu) = \mu^{-1}\exp (- x/\mu)\qquad(0 < x < \infty )
\]
where the parameter $\mu$ can take any value in $0 < \mu < \infty$.
\nextq Find a (two-dimensional) sufficient statistic
for $(\alpha, \beta)$
given an $n$-sample $\vect x = (x_1, x_2, \dots, x_n)$ from the
two-parameter gamma distribution
\[ p(x|\alpha,\beta)=\{\beta^\alpha\Gamma(\alpha)\}^{-1}
x^{\alpha-1}\exp (- x/\beta)\qquad(0 < x < \infty) \]
where the parameters $\alpha$ and $\beta$ can take any values in
$0 < \alpha < \infty$, $0 < \beta < \infty$.
\nextq Find a family of conjugate priors for the likelihood
$l(\beta|x) = p(x\,|\,\alpha,\beta)$ where $p(x\,|\,\alpha,\beta)$
is as in the previous question, but $\alpha$ is known.
\nextq Show that the tangent of a random angle (that is, one which is
uniformly distributed on $[0, 2\pi)$) has a Cauchy distribution
$\C(0,1)$.
\nextq Suppose that the vector $\vect x = (x, y, z)$ has a trinomial
distribution
depending on the index $n$ and the parameter
$\bpi = (\pi,\rho,\sigma)$ where $\pi+\rho+\sigma=1$, that is
\[ p(x|\bpi) =
\frac{n!}{x!\,y!\,z!}\pi^x\rho^y\sigma^z\qquad(x+y+z=n).
\]
Show that this distribution is in the two-parameter exponential
family.
\nextq Suppose that the results of a certain test are known, on the
basis
of general theory, to be normally distributed about the same mean
$\mu$ with the same variance $\phi$, neither of which is known.
Suppose further that your prior beliefs about $(\mu, \phi)$ can be
represented by a normal/chi-squared distribution with
\[ \nu_0 = 4,\qquad S_0 = 350,\qquad n_0 = 1,\qquad \theta_0 =
85. \]
Now suppose that 100 observations are obtained from the population,
with sample mean 89 and sample variance $s^2 = 30$. Find the posterior
distribution of $(\mu, \phi)$. Compare 50\% prior and posterior
HDRs for $\mu$.
\nextq Suppose that your prior for $\theta$ is a $\twothirds:\third$
mixture of $\N(0,1)$ and $\N(1,1)$ and that a single observation
$x\sim\N(\theta, 1)$ turns out to equal 2. What is your posterior
probability that $\theta>1$?
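\blankline
[One possible numerical route, using the conjugate updating of each
component of the mixture as in Section 2.10, is sketched below in \R;
the variable names are purely illustrative.]
\begin{verbatim}
x <- 2
# Posterior mixture weights are proportional to the prior weights
# times the predictive densities N(0, 2) and N(1, 2) evaluated at x.
w <- c(2/3, 1/3) * dnorm(x, mean = c(0, 1), sd = sqrt(2))
w <- w / sum(w)
m <- (c(0, 1) + x) / 2      # posterior component means
s <- sqrt(1/2)              # posterior component standard deviation
sum(w * pnorm(1, mean = m, sd = s, lower.tail = FALSE))   # P(theta > 1)
\end{verbatim}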
\nextq Establish the formula
\[ (n_0^{-1}+n^{-1})^{-1}(\mean x-\theta_0)^2=
n\mean x^2+n_0\theta_0^2-n_1\theta_1^2
\]
where $n_1=n_0+n$ and $\theta_1=(n_0\theta_0+n\mean x)/n_1$, which
was quoted in Section 2.13 as providing a formula for the parameter
$S_1$ of the posterior distribution in the case where both mean and
variance are unknown which is less susceptible to rounding errors.
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq Laplace
claimed that the probability that an event which has occurred
$n$ times, and has not hitherto failed, will occur again is
$(n + 1)/(n + 2)$ [see Laplace (1774)], which is sometimes known
as \textit{Laplace's rule of succession}. Suggest grounds for
this assertion.
\nextq Find a suitable interval of 90\% posterior probability to quote
in a case when your posterior distribution for an unknown parameter
$\pi$ is $\Be(20, 12)$,
and compare this interval with similar
intervals for the cases of $\Be(20.5, 12.5)$ and $\Be(21, 13)$
posteriors. Comment on the relevance of the results to the choice
of a reference prior for the binomial distribution.
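\blankline
[Equal-tailed 90\% intervals (a convenient stand-in for HDRs when the
density is not markedly skew) are easily found in \R, for example:]
\begin{verbatim}
probs <- c(0.05, 0.95)
rbind("Be(20,12)"     = qbeta(probs, 20,   12),
      "Be(20.5,12.5)" = qbeta(probs, 20.5, 12.5),
      "Be(21,13)"     = qbeta(probs, 21,   13))
\end{verbatim}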
\nextq Suppose that your prior beliefs about the probability $\pi$ of
success in Bernoulli trials have mean $1/3$ and variance $1/32$.
Give a 95\% posterior HDR for $\pi$ given that you have observed
8 successes in 20 trials.
\nextq Suppose that you have a prior distribution for the probability
$\pi$ of success in a certain kind of gambling game which has mean
0.4, and that you regard your prior information as equivalent to 12
trials. You then play the game 25 times and win 12 times. What
is your posterior distribution for $\pi$?
\nextq Suppose that you are interested in the proportion of females in a
certain organisation and that as a first step in your investigation
you intend to find out the sex of the first 11 members on the
membership list. Before doing so, you have prior beliefs which you
regard as equivalent to 25\% of this data, and your prior beliefs
suggest that a third of the membership is female.
Suggest a suitable prior distribution and find its standard
deviation.
Suppose that 3 of the first 11 members turn out to be female; find
your posterior distribution and give a 50\% posterior HDR for this
distribution.
Find the mean, median and mode of the posterior distribution.
Would it surprise you to learn that in fact 86 of the total number
of 433 members are female?
\nextq Show that if $g(x) = \sinh^{-1} \sqrt{(x/n)}$ then
\[ g'(x) = \half n^{-1} [(x/n)\{1 +
(x/n)\}]^{-\frac{1}{2}}. \]
Deduce that if $x \sim \NB(n, \pi)$ has a negative binomial
distribution
of index $n$ and parameter $\pi$ and $z = g(x)$ then
$\E z \cong \sinh^{-1} \sqrt{(\E x/n)}$ and $\Var z \cong 1/(4n)$.
What does this suggest as a reference prior for $\pi$?
\nextq The following data were collected by von Bortkiewicz (1898) on
the number of men killed by a horse
in certain Prussian army corps in
twenty years, the unit being one army corps for one year:
\begin{center}
\begin{tabular}{lrrrrrl}
Number of deaths: &0 &1 &2 &3 &4 &5 and more
\\
Number of units: &144 &91 &32 &11 &2 &0.
\end{tabular}
\end{center}
Give an interval in which the mean number $\lambda$ of such deaths
in a particular army corps in a particular year lies with 95\%
probability.
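\blankline
[A sketch of one way of carrying out the computation in \R\ follows; it
assumes, purely for illustration, the improper reference prior
$p(\lambda)\propto1/\lambda$, so that the posterior is a gamma
distribution, and it quotes an equal-tailed interval rather than an
exact HDR.]
\begin{verbatim}
deaths <- 0:5
units  <- c(144, 91, 32, 11, 2, 0)
n <- sum(units)              # 280 corps-years in all
S <- sum(deaths * units)     # 196 deaths in all
# Posterior is G(S, n) in the shape/rate parametrization
qgamma(c(0.025, 0.975), shape = S, rate = n)
\end{verbatim}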
\nextq Recalculate the answer to the previous question assuming that you
had a prior distribution for $\lambda$ of mean 0.66 and standard
deviation 0.115.
\nextq Find the Jeffreys prior for the parameter $\alpha$ of the
Maxwell distribution
\[ p(x|\alpha)=\sqrt{\frac{2}{\pi}}\alpha^{3/2}x^2\exp(-\half\alpha x^2)
\]
and find a transformation of this parameter in which the
corresponding prior is uniform.
\nextq Use the two-dimensional version of Jeffreys' rule to determine a
prior for the trinomial distribution
\[ p(x, y, z|\pi,\rho)\propto\pi^x\rho^y(1-\pi-\rho)^z. \]
(cf.\ question 15 on Chapter 2).
\nextq Suppose that $x$ has a Pareto distribution
$\Pa(\xi,\gamma)$ where $\xi$
is known but $\gamma$ is unknown, that is,
\[ p(x|\gamma) = \gamma\xi^\gamma x^{-\gamma-1} I_{(\xi,\infty)}(x).
\]
Use Jeffreys' rule
to find a suitable reference prior for $\gamma$.
\nextq Consider a uniform distribution on the interval $(\alpha,\beta)$,
and suppose that the prior for $(\alpha,\beta)$ is a bilateral
bivariate Pareto distribution with $\gamma = 2$. How large a random sample must
be taken from the uniform distribution in order that the
coefficient of variation (that is, the standard deviation
divided by the mean) of
the length $\beta - \alpha$ of the interval should be reduced to
0.01 or less?
\nextq Suppose that observations $x_1$, $x_2$, $\dots$, $x_n$ are
available from a density
\[ p(x|\theta)=(c+1)\theta^{-(c+1)}x^c\qquad(0 < x < \theta). \]
Explain how you would make inferences about the parameter $\theta$
using a conjugate prior.
\nextq What could you conclude if you observed \textit{two} tramcars
numbered, say, 71 and 100?
\nextq In Section 3.8 we discussed Newcomb's
observation
that the front pages of a well-used table of logarithms tend to get
dirtier than the back pages do. What if we had an
\textit{antilogarithm}
table, that is, a table giving the value of $x$ when $\log_{10} x$
is given? Which pages of such a table would be the dirtiest?
\nextq We sometimes investigate distributions on a circle (for
example, von Mises' distribution which is discussed in Section
3.9 on ``The circular normal distribution'').
Find a Haar prior for a location parameter on the circle (such
as $\mu$ in the case of von Mises' distribution).
\nextq Suppose that the prior distribution $p(\mu, \sigma)$ for the
parameters $\mu$ and $\sigma$ of a Cauchy distribution
\[ p(x|\mu, \sigma)=\frac{1}{\pi}\frac{\sigma}{\sigma^2+(x-\mu)^2}
\]
is uniform in $\mu$ and $\sigma$, and that two observations
$x_1 = 2$ and $x_2 = 6$ are available from this distribution.
Calculate the
value of the posterior density $p(\mu, \sigma|\vect x)$ (ignoring
the factor $1/\pi^2$) to two decimal places for $\mu = 0, 2, 4, 6, 8$
and $\sigma = 1, 2, 3, 4, 5$. Use Simpson's rule to approximate the
posterior marginal density of $\mu$, and hence go on to find an
approximation to the posterior probability that $3 < \mu < 5$.
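\blankline
[A sketch in \R\ of the grid evaluation and of the Simpson's rule step
is given below; normalizing the marginal density and integrating it over
$3<\mu<5$ (by Simpson's rule again) completes the question.]
\begin{verbatim}
x     <- c(2, 6)             # the two observations
mu    <- c(0, 2, 4, 6, 8)
sigma <- 1:5
# Posterior density (ignoring the factor 1/pi^2) over the grid
post <- outer(mu, sigma,
              Vectorize(function(m, s) prod(s / (s^2 + (x - m)^2))))
# Simpson's rule over sigma (h = 1) for the marginal density of mu
w       <- c(1, 4, 2, 4, 1) / 3
marg.mu <- as.vector(post %*% w)
\end{verbatim}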
\nextq Show that if the log-likelihood $L(\theta|x)$ is a concave
function of $\theta$ for each scalar $x$ (that is,
$L''(\theta|x) \leqslant 0$
for all $\theta$), then the likelihood function $L(\theta|\vect x)$
for $\theta$ given an $n$-sample $\vect x = (x_1, x_2, \dots, x_n)$
has a unique maximum. Prove that this is the case if the
observations $x_i$ come from a logistic density
\[ p(x|\theta)=
\exp(\theta-x)/\{1+\exp(\theta-x)\}^2\qquad(-\infty < x < \infty) \]
where $\theta$ is an unknown real parameter. Fill in the details
of the Newton-Raphson method and the method of scoring for finding
the position of the maximum, and suggest a suitable starting point
for the algorithms.
\blankline
[In many applications of Gibbs sampling, which we consider later
in Section 9.4, all full conditional densities are
log-concave (see Gilks \textit{et al.}, 1996, Section 5.3.3), so
the study of such densities is of real interest.]
\nextq Show that if an experiment consists of two observations, then the
total information it provides is the information provided by
one observation plus the mean amount provided by the second given
the first.
\nextq Find the entropy $H\{p(\theta)\}$ of a (negative) exponential
distribution with density
$p(\theta)=\beta^{-1}\exp(-\theta/\beta)$.
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq Show that if the prior probability $\pi_0$ of a hypothesis is
close to unity, then the posterior probability $p_0$ satisfies
$1-p_0\cong(1-\pi_0)B^{-1}$ and more exactly
$1-p_0\cong(1-\pi_0)B^{-1}+(1-\pi_0)^2(B^{-1}-B^{-2})$.
\nextq Watkins (1986, Section 13.3) reports that theory predicted the
existence of a Z
particle of mass $93.3 \pm 0.9$ GeV, while first
experimental results showed its mass to be $93.0 \pm 1.8$ GeV.
Find the prior and posterior odds and the Bayes ratio for the
hypothesis that its mass is less than 93.0 GeV.
\nextq An experimental station wishes to test whether a growth hormone
will increase the yield of wheat above the average value of 100
units per plot produced under currently standard conditions.
Twelve plots treated with the hormone give the yields:
\[
140,\quad 103,\quad 73,\quad 171,\quad 137,\quad 91,\quad 81,\quad
157,\quad 146,\quad 69,\quad 121,\quad 134.
\]
Find the $P$-value for the hypothesis under consideration.
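\blankline
[One way of finding the $P$-value here is via the usual one-sided
Student's $\t$ test; in \R, for example:]
\begin{verbatim}
yield <- c(140, 103, 73, 171, 137, 91, 81, 157, 146, 69, 121, 134)
t.test(yield, mu = 100, alternative = "greater")$p.value
\end{verbatim}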
\nextq In a genetic
experiment, theory predicts that if two genes are on
different chromosomes, then the probability of a certain event will
be 3/16. In an actual trial, the event occurs 56 times in 300.
Use Lindley's method to decide whether there is enough evidence to
reject the hypothesis that the genes are on the same chromosome.
\nextq With the data in the example in Section 3.4 on
``The Poisson distribution'', would it be appropriate to reject the
hypothesis that the true mean equalled the prior mean (that is,
that $\lambda=3$)? [Use Lindley's method.]
\nextq Suppose that the standard test statistic
$z=(\mean x-\theta_0)/\sqrt{(\phi/n)}$ takes the value $z = 2.5$
and that the sample size is $n = 100$. How close to $\theta_0$
does a value of $\theta$ have to be for the value of the normal
likelihood function at $\mean x$ to be within 10\% of its value
at $\theta=\theta_0$?
\nextq Show that the Bayes factor for a test of a point null hypothesis
for the normal distribution (where the prior under the alternative
hypothesis is also normal) can be expanded in a power series in
$\lambda=\phi/n\psi$ as
\[ B = \lambda^{-\frac{1}{2}}\exp(-\half z^2)\{1+\half\lambda(z^2+1)
+\dots\}. \]
\nextq Suppose that $x_1$, $x_2$, $\dots$, $x_n\sim\N(0,\phi)$. Show
that over the interval $(\phi-\varepsilon,\,\phi+\varepsilon)$ the
likelihood varies by a factor of approximately
\[ \exp\left\{
\frac{\varepsilon}{\phi}\left(\frac{\sum
x_i^2/n}{\phi}-1\right)\right\}.\]
\nextq At the beginning of Section 4.5, we saw that
under the alternative hypothesis that $\theta \sim \N(\theta_0,
\psi)$ the predictive density for $\mean x$ was
$\N(\theta_0, \psi+\phi/n)$, so that
\[ p_1(\mean x)=\{2\pi(\psi+\phi/n)\}^{-\frac{1}{2}}
\exp [-\half(\mean x - \theta_0)^2/(\psi+\phi/n)]
\]
Show that a maximum of this density considered as a function of
$\psi$ occurs when $\psi = (z^2 - 1)\phi/n$, which gives a
possible value for $\psi$ if $z \geqslant 1$. Hence show that
if $z \geqslant 1$ then for any such alternative hypothesis the
Bayes factor satisfies
\[ B \geqslant \sqrt{\text{e}}\, z \exp (-\half z^2) \]
and deduce a bound for $p_0$ (depending on the value of $\pi_0$).
\nextq In the situation discussed in Section 4.5,
for a given $P$-value (so equivalently for a given $z$) and
assuming that $\phi=\psi$, at what value of $n$ is the
posterior probability of the null hypothesis a minimum?
\nextq Mendel
(1865) reported finding 1850 angular wrinkled seeds to 5474
round or roundish in an experiment in which his theory predicted a
ratio of $1:3$. Use the method employed for Weldon's dice data in
Section 4.5 to test whether his theory is
confirmed by the data. [However, Fisher (1936) cast some doubt on
the genuineness of the data.]
\nextq A window
is broken in forcing entry to a house. The refractive index
of a piece of glass found at the scene of the crime is $x$, which
is supposed $\N(\theta_1, \phi)$. The refractive index of a piece
of glass found on a suspect is $y$, which is supposed $\N(\theta_2,
\phi)$. In the process of establishing the guilt or innocence of the
suspect, we are interested in investigating whether
$\Hyp_0: \theta_1 = \theta_2$
is true or not. The prior distributions of $\theta_1$ and
$\theta_2$
are both $\N(\mu, \psi)$ where $\psi\gg\phi$. Write
\[ u = x - y,\qquad z = \half(x + y).
\]
Show that, if $\Hyp_0$ is true and $\theta_1 = \theta_2 = \theta$,
then $\theta$, $x - \theta$ and $y - \theta$ are independent and
\[ \theta \sim \N(\mu, \psi),\qquad x - \theta \sim \N(0, \phi),
\qquad y - \theta \sim \N(0, \phi).
\]
By writing $u=(x-\theta)-(y-\theta)$ and
$z =\theta+ \half(x-\theta)+\half(y-\theta)$, go on to show that
$u$ has an $\N(0, 2\phi)$ distribution and that $z$ has an
$\N(\mu,\half\phi+\psi)$, so approximately an $\N(\mu, \psi)$,
distribution. Conversely, show that if $\Hyp_0$ is false and
$\theta_1$ and $\theta_2$ are assumed independent, then
$\theta_1$, $\theta_2$, $x - \theta_1$ and $y - \theta_2$ are all
independent and
\[ \theta_1\sim\N(\mu,\psi),\quad\theta_2\sim\N(\mu,\psi),\quad
x - \theta_1\sim\N(0, \phi),\quad y - \theta_2 \sim \N(0, \phi).
\]
By writing
\begin{align*}
u &= \theta_1 - \theta_2 + (x - \theta_1) - (y - \theta_2),
\\
z &= \half\{\theta_1+\theta_2+(x-\theta_1)+(y-\theta_2)\}
\end{align*}
show that in this case $u$ has an $\N(0, 2(\phi+\psi))$, so
approximately an $\N(0,2\psi)$, distribution, while $z$ has an
$\N(\mu,\half(\phi+ \psi))$, so approximately an
$\N(\mu,\half\psi)$,
distribution. Conclude that the Bayes factor is approximately
\[ B=\sqrt{(\psi/2\phi)}\exp[-\half u^2/2\phi+\half(z-\mu)^2/\psi].
\]
Suppose that the ratio $\sqrt{(\psi/\phi)}$ of the standard
deviations is 100 and that $u = 2\times\sqrt{(2\phi)}$, so
that the difference between $x$ and $y$ represents two
standard deviations, and that
$z = \mu$, so that both specimens are of commonly occurring glass.
Show that a classical test would reject $\Hyp_0$ at the 5\% level,
but that $B = 9.57$, so that the odds in favour of $\Hyp_0$ are
multiplied by a factor just below 10.
\blankline
[This problem is due to Lindley
(1977); see also Shafer (1982). Lindley
comments that, ``What the [classical] test fails to take into
account is the extraordinary coincidence of $x$ and $y$ being so
close together were the two pieces of glass truly different''.]
\nextq Lindley (1957) originally discussed his paradox
under slightly different assumptions from those made in this
book. Follow through the reasoning
used in Section 4.5 with $\rho_1(\theta)$
representing a uniform distribution
on the interval $(\theta_0-\half\tau,\,\theta_0+\half\tau)$
to find the corresponding Bayes factor assuming that
$\tau^2\gg\phi/n$,
so that an $\N(\theta_0, \phi/n)$ variable lies in this interval with
very high probability. Check that your answers are unlikely to
disagree with those found in Section 4.5 under the
assumption that $\rho_1(\theta)$ represents a normal density.
\nextq Express in your own words the arguments given by Jeffreys (1961,
Section 5.2) in favour of a Cauchy distribution
\[
\rho_1(\theta)=\frac{1}{\pi}\frac{\sqrt{\psi}}{\psi+(\theta-\theta_0)^2}
\]
in the problem discussed in the previous question.
\nextq Suppose that $x$ has a binomial distribution
$B(n, \theta)$ of index
$n$ and parameter $\theta$, and that it is desired to test
$\Hyp_0: \theta = \theta_0$ against the alternative hypothesis
$\Hyp_1: \theta\neq\theta_0$.
\begin{description}
\item[\quad(a)] Find lower bounds on the posterior probability of
$\Hyp_0$ and
on the Bayes factor for $\Hyp_0$ versus $\Hyp_1$, bounds which are
valid
for any $\rho_1(\theta)$.
\item[\quad(b)] If $n = 20$, $\theta_0 = \frac{1}{2}$ and $x = 15$ is
observed, calculate the (two-tailed) $P$-value and the lower bound
on the posterior probability when the prior probability $\pi_0$ of
the null hypothesis is $\half$.
\end{description}
\nextq Twelve observations from a normal distribution of mean $\theta$
and variance $\phi$ are available, of which the sample mean is 1.2
and the sample variance is 1.1. Compare the Bayes factors in
favour of the null hypothesis that $\theta=\theta_0$ assuming
(a) that $\phi$ is unknown and (b) that it is known that $\phi = 1$.
\nextq Suppose that in testing a point null hypothesis you find a value
of the usual Student's $\t$
statistic of 2.4 on 8 degrees of freedom. Would the methodology of
Section 4.6 require you to ``think again''?
\nextq Which entries in the table in Section 4.5
on ``Point null hypotheses for the normal distribution'' would,
according to the methodology of Section 4.6, cause you to
``think again''?
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq Two analysts measure the percentage of ammonia in a chemical
process over 9 days and find the following discrepancies
between their results:
\[
\begin{array}{lccccccccc}
\text{Day} & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9
\\
\text{Analyst A}&12.04&12.37&12.35&12.43&12.34&12.36&12.48&12.33&12.33
\\
\text{Analyst B}&12.18&12.37&12.38&12.36&12.47&12.48&12.57&12.28&12.42
\end{array}
\]
Investigate the mean discrepancy $\theta$ between their results
and in particular give an interval in which you are 90\% sure
that it lies.
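\blankline
[A minimal \R\ sketch, treating the nine daily differences as a single
normal sample with the usual reference prior, so that the 90\% HDR has
the familiar $\t$ form:]
\begin{verbatim}
A <- c(12.04, 12.37, 12.35, 12.43, 12.34, 12.36, 12.48, 12.33, 12.33)
B <- c(12.18, 12.37, 12.38, 12.36, 12.47, 12.48, 12.57, 12.28, 12.42)
d <- B - A                   # daily discrepancies
n <- length(d)
mean(d) + c(-1, 1) * qt(0.95, n - 1) * sd(d) / sqrt(n)   # 90% interval
\end{verbatim}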
\nextq With the same data as in the previous question, test the
hypothesis that there is no discrepancy between the two analysts.
\nextq Suppose that you have grounds for believing that observations
$x_i$, $y_i$ for $i=1$, 2, \dots, $n$ are such that
$x_i\sim\N(\theta,\phi_i)$ and also $y_i\sim\N(\theta,\phi_i)$, but
that you are not prepared to assume that the $\phi_i$ are equal.
What statistic would you expect to base inferences about $\theta$
on?
\nextq How much difference would it make to the analysis of the data in
Section 5.1 on rat diet if we took
$\omega=\half(\phi+\psi)$ instead of $\omega=\phi+\psi$?
\nextq Two analysts in the same laboratory made repeated determinations
of the percentage of fibre in soya cotton cake, the results being
as shown below:
\[
\begin{array}{llllllllll}
\text{Analyst A}&12.38&12.53&12.25&12.37&12.48&12.58&12.43&12.43&12.30
\\
\text{Analyst B}&12.25&12.45&12.31&12.31&12.30&12.20&12.25&12.25&12.26
\\
&12.42&12.17&12.09
\end{array}
\]
Investigate the mean discrepancy $\theta$ between their mean
determinations and in particular give an interval in which you are
90\% sure that it lies
\begin{description}
\item[\quad(a)] assuming that it is known from past experience that the
standard deviation of both sets of observations is 0.1, and
\item[\quad(b)] assuming simply that it is known that the standard
deviations of the two sets of observations are equal.
\end{description}
\nextq A random sample $\vect x = (x_1, x_2, \dots, x_m)$ is available
from an $\N(\lambda,\phi)$ distribution and a second independent
random sample $\vect y = (y_1, y_2, \dots, y_n)$ is available
from an $\N(\mu, 2\phi)$ distribution. Obtain, under the usual
assumptions, the posterior distributions of $\lambda-\mu$ and of
$\phi$.
\nextq Verify the formula for $S_1$ given towards the end of Section
5.2.
\nextq The following data consists of the lengths in mm of cuckoo's eggs
found in nests belonging to the dunnock and to the reed warbler:
\[
\begin{array}{lllllllllll}
\text{Dunnock} &22.0&23.9&20.9&23.8&25.0&24.0&21.7&23.8&22.8&23.1
\\
\text{Reed warbler} &23.2&22.0&22.2&21.2&21.6&21.9&22.0&22.9&22.8
\end{array}
\]
Investigate the difference $\theta$ between these lengths without
making any particular assumptions about the variances of the two
populations, and in particular give an interval in which you are
90\% sure that it lies.
\nextq Show that if $m=n$ then the expression $f_1^2/f_2$ in Patil's
approximation reduces to
\[ \frac{4(m-5)}{3 + \cos 4\theta}. \]
\nextq Suppose that $T_x$, $T_y$ and $\theta$ are defined as in Section
5.3
and that
\[ T=T_x\sin\theta-T_y\cos\theta,\qquad
U=T_x\cos\theta+T_y\sin\theta \]
Show that the transformation from $(T_x, T_y)$ to $(T, U)$ has unit
Jacobian and hence show that the density of $T$ satisfies
\begin{align*}
p(T|\vect x,\vect y) &\propto \int_0^{\infty}
[1 + (T\sin\theta+U\cos\theta)^2/\nu_x]^{-(\nu_x+1)/2}
\\
&\qquad\times[1 +
(-T\cos\theta+U\sin\theta)^2/\nu_y]^{-(\nu_y+1)/2}\dU.
\end{align*}
\nextq Show that if $x\sim\F_{\nu_1,\nu_2}$ then
\[ \frac{\nu_1x}{\nu_2+\nu_1x} \sim \Be(\half\nu_1,\half\nu_2). \]
\nextq Two different microscopic methods, $A$ and $B$, are available for
the measurement of very small dimensions in microns. As a result of
several such measurements on the same object, estimates of variance
are available as follows:
\[
\begin{array}{lll}
\text{Method}\hspace{25mm} & A\hspace{20mm}
& B \\
\text{No. of observations} & m = 15 & n = 25
\\
\text{Estimated variance} & s_1^2 = 7.533 & s_2^2 =
1.112
\end{array}
\]
\,\,Give an interval in which you are 95\% sure that the ratio
of the variances lies.
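\blankline
[One possible numerical sketch in \R, assuming the usual reference
priors so that the posterior of the variance ratio has the standard
$\F$ form, and quoting an equal-tailed interval:]
\begin{verbatim}
m <- 15; n <- 25
s1.sq <- 7.533; s2.sq <- 1.112
ratio <- s1.sq / s2.sq
# (s1.sq/s2.sq)/kappa, where kappa = phi1/phi2, is F on (m-1, n-1) d.f.
c(ratio / qf(0.975, m - 1, n - 1), ratio / qf(0.025, m - 1, n - 1))
\end{verbatim}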
\nextq Measurement errors when using two different instruments are
more or less symmetrically distributed and are believed to be
reasonably well approximated by a normal distribution. Ten
measurements with each show a sample standard deviation three times
as large with one instrument as with the other. Give an interval
in which you are 99\% sure that the ratio of the true standard
deviations lies.
\nextq Repeat the analysis of Di Raimondo's data in Section
5.6 on the effects of penicillin on mice, this
time assuming that you have prior knowledge worth about six
observations in each case suggesting that the mean chance of
survival is about a half with the standard injection but about
two-thirds with the penicillin injection.
\nextq The table below [quoted from Jeffreys (1961, Section 5.1)]
gives the relationship between grammatical gender in Welsh and
psychoanalytical symbolism according to Freud:
\[
\begin{array}{lll}
\text{Psycho. $\backslash$ Gram.}\hspace{5mm} & M\hspace{10mm} & F
\\
M & 45 &
30 \\
F & 28 &
29 \\
\text{Total} & 73 &
59
\end{array}
\]
Find the posterior probability that the log odds-ratio is positive
and compare it with the comparable probability found by using the
inverse root-sine transformation.
\nextq Show that if $\pi \cong \rho$ then the log odds-ratio is such
that
\[ \Lambda-\Lambda' \cong (\pi-\rho)/\{\pi(1 - \pi)\}.
\]
\nextq A report issued in 1966 about the effect of radiation on patients
with inoperable lung cancer compared the effect of radiation
treatment with placebos. The numbers surviving after a year were:
{\catcode`?=\active
\def?{\kern\digitwidth}
\[
\begin{array}{lll}
\hspace{25mm} & \text{Radiation}\hspace{5mm} & \text{Placebos}
\\
\text{No. of cases} & 308 & 246
\\
\text{No. surviving} & ?56 & ?34
\end{array}
\]
}
\!\!What are the approximate posterior odds that the one-year
survival rate of irradiated patients is at least 0.01 greater than
that of those who were not irradiated?
\nextq Suppose that $x \sim \P(8.5)$, i.e. $x$ is Poisson
of mean 8.5,
and $y \sim \P(11.0)$. What is the approximate distribution of
$x - y$?
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq The sample correlation coefficient
between length and weight of a
species of frog was determined at each of a number of sites. The
results were as follows:
{\catcode`?=\active
\def?{\kern\digitwidth}
\[
\begin{array}{llllll}
\text{Site}& ?1& ?2& ?3& ?4& ?5
\\
\text{Number of frogs}& 12& 45& 23& 19& 30
\\
\text{Correlation}& ?0.631& ?0.712& ?0.445& ?0.696& ?0.535
\end{array}
\]
}
Find an interval in which you are 95\% sure that the correlation
coefficient lies.
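\blankline
[A sketch in \R\ using the inverse hyperbolic tangent transformation,
taking the posterior of $\zeta=\tanh^{-1}\rho$ from each site to be
approximately normal with variance $1/n$ (the choice $1/(n-3)$ is a
common alternative) and combining the sites by weighting with their
precisions:]
\begin{verbatim}
n <- c(12, 45, 23, 19, 30)
r <- c(0.631, 0.712, 0.445, 0.696, 0.535)
z <- atanh(r)                    # Fisher's z transformation
zeta.hat <- sum(n * z) / sum(n)  # precision-weighted combination
interval <- zeta.hat + c(-1, 1) * qnorm(0.975) / sqrt(sum(n))
tanh(interval)                   # back to the correlation scale
\end{verbatim}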
\nextq Three groups of children were given two tests. The numbers of
children and the sample correlation coefficients
between the two test
scores in each group were as follows:
\[
\begin{array}{lccc}
\text{Number of children}& 45& 34& 49 \\
\text{Correlation}& 0.489& 0.545& 0.601
\end{array}
\]
Is there any evidence that the association between the two tests
differs in the three groups?
\nextq Suppose you have sample correlation coefficients $r_1$, $r_2$,
$\dots$, $r_k$ on the basis of sample sizes $n_1$, $n_2$, $\dots$,
$n_k$. Give a 95\% posterior confidence interval for
$\zeta=\tanh^{-1}\rho$.
\nextq From the approximation
\[ p(\rho|\vect x,\vect y) \propto (1 - \rho^2)^{n/2}(1 - \rho r)^{-n}
\]
which holds for large $n$, deduce an expression for the
log-likelihood $L(\rho|\vect x,\vect y)$ and hence show that
the maximum likelihood occurs when $\rho = r$. An approximation
to the information can now be
made by replacing $r$ by $\rho$ in the second derivative of the
likelihood, since $\rho$ is near $r$ with high probability. Show
that this approximation suggests a prior density of the form
\[ p(\rho) \propto (1 - \rho^2)^{-1}. \]
\nextq Use the fact that
\[ \int_0^{\infty} (\cosh t+\cos\theta)^{-1}\dt=\theta/\sin\theta \]
(cf.\ Edwards, 1921, art.\ 180) to show that
\[ p(\rho|\vect x,\vect y) \propto p(\rho) (1 - \rho^2)^{(n-1)/2}
\frac{\d^{n-2}}{\d(\rho r)^{n-2}}
\left(\frac{\arccos(-\rho r)}{\sqrt{(1-\rho^2r^2)}}\right). \]
\nextq Show that in the special case where the sample correlation
coefficient
$r = 0$ and the prior takes the special form
$p(\rho) \propto (1 - \rho^2)^k$ the variable
\[ \sqrt{(k + n + 1)} \rho/(1 - \rho^2) \]
has a Student's $\t$ distribution on $k + n + 1$ degrees of
freedom.
\nextq By writing
\begin{align*}
\omega^{-1}(\omega + \omega^{-1} - 2\rho r)^{-(n-1)}
&= \omega^{n-2}(1 - \rho^2r^2)^{-(n-1)} \\
&\qquad\times [1+(\omega-\rho r)^2 (1-\rho^2r^2)^{-1}]^{-(n-1)}
\end{align*}
and using repeated integration by parts, show that the posterior
distribution of $\rho$ can be expressed as a finite series
involving powers of
\[ \sqrt{(1 - \rho r)/(1 + \rho r)} \]
and Student's $\t$ integrals.
\nextq By substituting
\[ \cosh t - \rho r = \frac{1-\rho r}{1-u} \]
in the form
\[ p(\rho|\vect x,\vect y) \propto p(\rho) (1 - \rho^2)^{(n-1)/2}
\int_0^{\infty} (\cosh t - \rho r)^{-(n-1)} \dt
\]
for the posterior density of the correlation coefficient
and then expanding
\[ [1 - \half(1 + \rho r)u]^{-\frac{1}{2}}
\]
as a power series in $u$, show that the integral can be expressed as
a series of beta functions. Hence deduce that
\[ p(\rho|\vect x,\vect y) \propto p(\rho) (1 - \rho^2)^{(n-1)/2}
(1-\rho r)^{-n+(3/2)}S_n(\rho r)
\]
where
\[ S_n(\rho r)=1 + \sum_{l=1}^{\infty} \frac{1}{l!}
\left(\frac{1+\rho r}{8}\right)^l
\prod_{s=1}^l \frac{(2 s-1)^2}{(n-\frac{3}{2}+s)}.
\]
\nextq Fill in the details of the derivation of the prior
\[ p(\phi,\psi,\rho) \propto (\phi\psi)^{-1} (1 - \rho^2)^{-3/2}
\]
from Jeffreys' rule
as outlined at the end of Section 6.1.
\nextq The data below consist of the estimated gestational ages (in
weeks) and weights (in grammes) of twelve female babies:
{
\[
\begin{array}{lc@{}c@{}c@{}c@{}c@{}c@{}c@{}c@{}c@{}c@{}c@{}c}
\text{Age}& 40 & 36 & 40 & 38 & 42 & 39 & 40 & 37 & 36 & 38 & 39 & 40
\\
\text{Weight}&\,3317\,&\,2729\,&\,2935\,&\,2754\,&\,3210\,&\,2817\,&
\,3126\,&\,2539\,&\,2412\,&\,2991\,&\,2875\,&\,3231\,
\end{array}
\]
}
Give an interval in which you are 90\% sure that the gestational
age of a particular such baby will lie if its weight is 3000
grammes, and give a similar interval in which the mean weight of
all such babies lies.
\nextq Show directly from the definitions that, in the notation of
Section 6.3,
\[ S_{ee} = \sum\{y_i - a - b(x_i - \mean x)\}^2. \]
\nextq Observations $y_i$ for $i = -m, -m + 1, \dots, m$ are available
which satisfy the regression
model
\[ y_i \sim \N(\alpha + \beta u_i + \gamma v_i,\,\phi) \]
where $u_i = i$ and $v_i = i^2 - \third m(m + 1)$. Adopting the
standard reference prior
$p(\alpha, \beta, \gamma, \phi)$ $\propto 1/\phi$,
show that the posterior distribution of $\alpha$ is such that
\[ \frac{\alpha-\mean y}{s/\sqrt{n}}\sim\t_{n-3} \]
where $n = 2 m + 1$, $s^2 = S_{ee}/(n - 3)$ and
\[ S_{ee} = S_{yy} - S_{uy}^2/S_{uu} - S_{vy}^2/S_{vv} \]
in which $S_{yy}$, $S_{uy}$, etc., are defined by
\[ S_{yy} = \sum (y_i - \mean y)^2,\qquad
S_{uy} = \sum (u_i - \mean u)(y_i - \mean y). \]
[\textit{Hint:} Note that $\sum u_i = \sum v_i = \sum u_iv_i = 0$,
and hence $\mean u=\mean v=0$ and $S_{uv}=0$.]
\nextq Fisher (1925b, Section 41) quotes an experiment on the accuracy
of counting soil bacteria. In it, a soil sample was divided into
four parallel samples, and from each of these, after dilution, seven
plates were inoculated. The number of colonies on each plate is
shown below. Do the results from the four samples agree within
the limits of random sampling?
\[
\begin{array}{lcccc}
\text{Plate $\backslash$ Sample}\hspace{5mm} &
\hspace{5mm}\text{A}\hspace{5mm}&
\hspace{5mm}\text{B}\hspace{5mm}&
\hspace{5mm}\text{C}\hspace{5mm}&
\hspace{5mm}\text{D}\hspace{5mm}
\\
\quad 1 & 72 & 74 & 78 & 69
\\
\quad 2 & 69 & 72 & 74 & 67
\\
\quad 3 & 63 & 70 & 70 & 66
\\
\quad 4 & 59 & 69 & 58 & 64
\\
\quad 5 & 59 & 66 & 58 & 64
\\
\quad 6 & 53 & 58 & 56 & 58
\\
\quad 7 & 51 & 52 & 56 & 54
\end{array}
\]
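\blankline
[The sums of squares needed can be obtained from a standard one-way
analysis of variance; a minimal \R\ sketch of the classical table, from
which the Bayesian analysis proceeds, is:]
\begin{verbatim}
counts <- c(72, 69, 63, 59, 59, 53, 51,    # sample A
            74, 72, 70, 69, 66, 58, 52,    # sample B
            78, 74, 70, 58, 58, 56, 56,    # sample C
            69, 67, 66, 64, 64, 58, 54)    # sample D
samp <- gl(4, 7, labels = c("A", "B", "C", "D"))
summary(aov(counts ~ samp))
\end{verbatim}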
\nextq In the case of the data on scab disease quoted in Section
6.5, find a contrast measuring the effect of the
season in which sulphur is applied and give an appropriate HDR for
this contrast.
\nextq The data below [from Wishart and Sanders (1955, Table 5.6)]
represent the weight of green produce in pounds made on an
old pasture. There were three main treatments, including a
control (O) consisting of the untreated land. In the
other cases the effect of a grass-land rejuvenator (R) was
compared with the use of the harrow (H). The blocks were
therefore composed of three plots each, and the experiment
consisted of six randomized blocks placed side by side. The
plan and yields were as follows:
\vspace{0.75\baselineskip}
{\small
\hbox{
\hspace{-1.2em}
\begin{tabular}{c@{\ }c@{\ }c|c@{\ }c@{\ }c|c@{\ }c@{\ }c|
c@{\ }c@{\ }c|c@{\ }c@{\ }c|c@{\ }c@{\ }c}
O.& H.& R.& R.& H.& O.& O.& R.& H.& O.& R.& H.& H.& O.& R.& O.&
H.& R.\\
813&647&713&814&759&795&705&652&598&774&617&559&580&687&539&581&480&437
\end{tabular}
}
}
\vspace{0.75\baselineskip}
\noindent
Derive an appropriate two-way analysis of variance.
\nextq Express the two-way layout as a particular case of the general
linear model.
\nextq Show that the matrix
$\matr A^{+}=(\matr A\transpose\matr A)^{-1}\matr A\transpose$
which arises in the theory of the general linear model is a
\textit{generalized inverse} of the (usually non-square) matrix
$\matr A$ in that
\begin{description}
\item[\quad(a)] $\matr A\matr A^{+}\matr A=\matr A$
\item[\quad(b)] $\matr A^{+}\matr A\matr A^{+}=\matr A^{+}$
\item[\quad(c)] $(\matr A\matr A^{+})\transpose=\matr A\matr A^{+}$
\item[\quad(d)] $(\matr A^{+}\matr A)\transpose=\matr A^{+}\matr A$
\end{description}
\nextq Express the bivariate linear regression model in terms of the
original parameters $\beeta=(\eta_0,\eta_1)\transpose$ and the
matrix $\matr A_0$ and use the general linear model to find the
posterior distribution of $\beeta$.
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq Show that if, in an experiment $E$, there is a possible
value $y$ for the random variable $\random x$ such that
$p_{\random x}(y|\theta) = 0$, and $z$ is any other possible
value
of $\random x$, then the statistic $t = t(x)$ defined by
\[ t(x) = \left\{\begin{array}{ll}
z & \text{if $x=y$} \\
x & \text{if $x\neq y$}
\end{array}\right. \]
is sufficient
for $\theta$ given $x$. Hence show that if $\random x$ is a
continuous random variable, then a na\"\i ve application of the
weak sufficiency principle as defined in Section 7.1
would result in $\Ev\{E, y, \theta\} = \Ev\{E, z, \theta\}$ for any
two possible values $y$ and $z$ of $\random x$.
\nextq Consider an experiment $E = \{\random x,\theta, p(x|\theta)\}$.
We say that \textit{censoring} (strictly speaking, fixed censoring)
occurs with censoring mechanism $g$ (a known function of $x$) when,
instead of $\random x$, one observes $y=g(x)$. A typical example
occurs when we report $x$ if $x < k$ for some fixed $k$, but
otherwise simply report that $x\geqslant k$. As a result, the
experiment really performed is
$E^g = \{\random y,\theta, p(y|\theta)\}$.
A second method with censoring mechanism $h$ is said to be
\textit{equivalent} to the first when
\[ g(x)=g(x')\text{\quad if and only if\quad}h(x)=h(x'). \]
As a special case, if $g$ is one-to-one then the mechanism is said
to be equivalent to no censoring. Show that if two censoring
mechanisms are equivalent, then the likelihood principle implies
that
\[ \Ev\{E^g, x, \theta\}=\Ev\{E^h, x, \theta\}. \]
\nextq Suppose that the density function $p(x|\theta)$ is defined as
follows for $x = 1, 2, 3, \dots$ and $\theta = 1, 2, 3, \dots$.
If $\theta$ is even, then
\begin{align*}
p(x|\theta)&=\left\{\begin{array}{ll}
\frac{1}{3} & \text{if $x=\theta/2$, $2\theta$
or $2\theta+1$}
\\
0 & \text{otherwise}
\end{array}\right. \\
\intertext{if $\theta$ is odd but $\theta\neq1$, then}
p(x|\theta)&=\left\{\begin{array}{ll}
\frac{1}{3} & \text{if $x=(\theta-1)/2$,
$2\theta$
or $2\theta+1$}
\\
0 & \text{otherwise}
\end{array}\right. \\
\intertext{while if $\theta = 1$ then}
p(x|\theta)&=\left\{\begin{array}{ll}
\frac{1}{3} & \text{if $x=\theta$, $2\theta$
or $2\theta+1$}
\\
0 & \text{otherwise}
\end{array}\right.
\end{align*}
Show that, for any $x$, the data intuitively give equal support
to the three possible values of $\theta$ compatible with that
observation, and hence that on likelihood grounds any of the three
would be a suitable estimate. Consider, therefore, the three
possible estimators $d_1$, $d_2$ and $d_3$ corresponding to the
smallest, middle and largest possible $\theta$. Show that
\begin{align*}
p(d_2=1)&=\left\{\begin{array}{ll}
\frac{1}{3} & \text{when $\theta$ is even} \\
0 & \text{otherwise}
\end{array}\right. \\
\ \\
p(d_3=1)&=\left\{\begin{array}{ll}
\frac{1}{3} &
\text{when $\theta$ is odd but
$\theta\neq1$} \\
0 & \text{otherwise}
\end{array}\right. \\
\intertext{but that}
p(d_1=1)&=\left\{\begin{array}{ll}
1 & \text{when $\theta=1$} \\
\frac{2}{3} & \text{otherwise}
\end{array}\right.
\end{align*}
Does this apparent discrepancy cause any problems for a Bayesian
analysis? (due to G.~Monette and D.~A.~S.~Fraser).
\nextq A drunken soldier,
starting at an intersection O in a city which
has square blocks, staggers around a random path trailing a taut
string. Eventually he stops at an intersection (after walking
at least one block) and buries a treasure. Let $\theta$ denote
the path of the string from O to the treasure. Letting $N$, $S$,
$E$ and $W$ stand for a path segment one block long in the
indicated direction, so that $\theta$ can be expressed as a
sequence of such letters, say $\theta =\!\!\textit{NNESWSWW}$.
(Note that $NS$, $SN$, $EW$ and $WE$ cannot appear as the taut
string would be rewound).
After burying the treasure, the soldier walks one block further in
a random direction (still keeping the string taut). Let $X$ denote
this augmented path, so that $X$ is one of $\theta N$, $\theta S$,
$\theta E$ and $\theta W$, each with probability $\quarter$. You
observe $X$ and are then to find the treasure. Show that if you
use a reference
prior $p(\theta) \propto 1$ for all possible paths
$\theta$, then all four possible values of $\theta$ given $X$ are
equally likely. Note, however, that intuition would suggest that
$\theta$ is three times as likely to extend the path as to
backtrack, suggesting that one particular value of $\theta$ is
more likely than
the others after $X$ is observed. (Due to M. Stone).
\nextq Suppose that, starting with a fortune of $f_0$ units, you bet $a$
units each time on evens at roulette (so that you have a
probability of 18/37 of winning at Monte Carlo or 18/38 at Las
Vegas) and keep a record of your fortune $f_n$ and the difference
$d_n$ between the number of times you win and the number of times
you lose in $n$ games. Which of the following are stopping times?
\begin{description}
\item[\quad(a)] The last time $n$ at which $f_n\geqslant f_0$?
\item[\quad(b)] The first time that you win in three successive games?
\item[\quad(c)] The value of $n$ for which
$f_n=\max_{\,\{0\leqslant k < \infty\}} f_k$ ?
\end{description}
\nextq Suppose that $x_1, x_2, \dots$ is a sequential sample from an
$\N(\theta, 1)$ distribution and it is desired to test
$\Hyp_0: \theta = \theta_0$ versus $\Hyp_1: \theta\neq\theta_0$.
The experimenter reports that he used a proper stopping rule
and obtained the data 3, $-1$, 2, 1.
\begin{description}
\item[(a)] What could a frequentist conclude?
\item[(b)] What could a Bayesian conclude?
\end{description}
\nextq Let $x_1, x_2, \dots$ be a sequential sample from a Poisson
distribution
$\P(\lambda)$. Suppose that the stopping rule is to
stop sampling at time $n \geqslant 2$ with probability
\[ \sum_{i=1}^{n-1} x_i \left/ \sum_{i=1}^n x_i\right. \]
for $n = 2, 3, \dots$ (define $0/0 = 1$). Suppose that the first
five observations are 3, 1, 2, 5, 7 and that sampling then stops.
Find the likelihood function for $\lambda$. (Berger, 1985).
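\blankline
[A short \R\ sketch; it takes for granted the point, which the answer
needs to justify, that the data-dependent stopping rule contributes only
a constant factor, so that the likelihood is proportional to
$\lambda^{\sum x_i}\text{e}^{-n\lambda}$.]
\begin{verbatim}
x   <- c(3, 1, 2, 5, 7)
lik <- function(lambda) lambda^sum(x) * exp(-length(x) * lambda)
curve(lik, from = 0, to = 10,
      xlab = expression(lambda), ylab = "likelihood (unnormalized)")
\end{verbatim}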
\nextq Show that the mean of the beta-Pascal distribution
\[ p(S|R, r, s)=\binom{S}{s}\frac{\Betafn(r''+s, R''-r''+S-s)}
{\Betafn(r''-1, R''-r'')} \]
is given by the formula in Section 7.3, namely,
\[ \E S=(s+1)\left(\frac{R''-2}{r''-2}\right)-1 \]
\nextq Suppose that you intend to observe the number $x$ of successes
in $n$ Bernoulli trials and then to go on observing the sequence,
letting $y$ denote the number of failures before the $n$th
subsequent success, so that $x\sim\B(n,\pi)$
and $y\sim\NB(n,\pi)$.
Find the likelihood function $L(\pi|x, y)$
and deduce the reference prior
that Jeffreys' rule
would suggest for this case.
\nextq The negative of loss is sometimes referred to as
\textit{utility}. Consider a gambling game very unlike most in
that you are bound to win at least $\pounds 2$, and accordingly in
order to be allowed to play, you must pay an entry fee of
$\pounds e$.
A coin is tossed until it comes up heads, and if this occurs
for the first time on the $n$th toss, you receive $\pounds 2^n$.
Assuming that the utility to you of making a gain of $\pounds x$ is
$u(x)$, find the expected utility of this game, and then discuss
whether it is plausible that $u(x)$ is directly proportional to
$x$. [The gamble discussed here is known as the \textit{St
Petersburg Paradox}. A fuller discussion of it can be found in
Leonard and Hsu (1999, Chapter 4).]
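[A quick \R\ simulation (not part of the original question) illustrates
why the expected gain is problematic; the number of plays and the seed
are arbitrary.]
\begin{verbatim}
# Simulate the St Petersburg game: toss a fair coin until the first
# head, receiving 2^n pounds if that happens on the nth toss.
set.seed(1)                          # arbitrary seed
n <- rgeom(100000, prob = 0.5) + 1   # toss on which the first head occurs
payoff <- 2^n
mean(payoff)                         # sample mean; it does not settle down
                                     # however many games are simulated
\end{verbatim}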
\nextq Suppose that you want to estimate the parameter $\pi$ of a
binomial
distribution $\B(n, \pi)$. Show that if the loss function is
\[ L(\pi, a) = (\pi - a)^2/\{\pi(1 - \pi)\} \]
then the Bayes rule corresponding to a uniform (that is,
$\Be(1,1)$) prior for $\pi$ is given by $d(x) = x/n$ for any
$x$ such that $0 < x < n$, that is, the maximum likelihood
estimator. Is $d(x) = x/n$
a Bayes rule if $x = 0$ or $x = n$?
\nextq Let $x\sim\B(n, \pi)$ and $y\sim\B(n, \rho)$ have independent
binomial distributions
of the same index but possibly different
parameters. Find the Bayes rule
corresponding to the loss
\[ L((\pi,\rho), a) = (\pi - \rho - a)^2 \]
when the priors for $\pi$ and $\rho$ are independent uniform
distributions.
\nextq Investigate possible point estimators for $\pi$ on the
basis of the posterior distribution in the example in
the subsection of Section 2.10 headed
``Mixtures of conjugate densities''.
\nextq Find the Bayes rule corresponding to the loss function
\[ L(\theta, a)=\left\{\begin{array}{ll}
                     u(\theta-a) & \mbox{if $a\leqslant\theta$}
\\
                     v(a-\theta) & \mbox{if $a\geqslant\theta$}.
                 \end{array}\right. \]
\nextq Suppose that your prior for the proportion $\pi$ of defective
items supplied by a manufacturer is given by the beta distribution
$\Be(2, 12)$, and that you then observe that none of a random
sample of size 6 is defective. Find the posterior distribution
and use it to carry out a test of the hypothesis
$\Hyp_0: \pi < 0.1$ using
\begin{description}
\item[(a)] a ``0 -- 1'' loss function, and
\item[(b)] the loss function
\[
\hspace{-0.9cm}
\begin{array}{lll}
a\backslash\theta\hspace{10mm} & \theta\in\Theta_0\hspace{10mm} &
\theta\in\Theta_1 \\
a_0 & 0 & 1 \\
a_1 & 2 & 0
\end{array}
\]
\end{description}
\nextq Suppose there is a loss function $L(\theta, a)$ defined by
\[
{\catcode`?=\active
\def?{\kern\digitwidth}
\begin{array}{lll}
a\backslash\theta\hspace{10mm} & \theta\in\Theta_0\hspace{10mm} &
\theta\in\Theta_1 \\
a_0 & ?0 & 10
\\
a_1 & 10 & ?0
\\
a_2 & ?3 & ?3
\end{array}
}
\]
On the basis of an observation $x$ you have to take action
$a_0$, $a_1$ or $a_2$. For what values of the posterior
probabilities $p_0$ and $p_1$ of the hypotheses
$\Hyp_0: \theta\in\Theta_0$ and $\Hyp_1:
\theta\in\Theta_1$ would you take each of the possible actions?
\nextq A child is given an intelligence test. We assume that the test
result $x$ is $\N(\theta, 100)$ where $\theta$ is the true
intelligence quotient of the child, as measured by the test
(in other words, if
the child took a large number of similar tests, the average score
would be $\theta$). Assume also that, in the population as a
whole, $\theta$ is distributed according to an $\N(100, 225)$
distribution. If it is desired, on the basis of the intelligence
quotient, to decide whether to put the child into a slow, average
or fast group for reading, the actions available are:
\blankline
$a_1:$ Put in slow group, that is, decide
       $\theta\in\Theta_1 = (0, 90)$ \\
$a_2:$ Put in average group, that is, decide
       $\theta\in\Theta_2 = [90, 110]$ \\
$a_3:$ Put in fast group, that is, decide
       $\theta\in\Theta_3 = (110, \infty).$
\blankline
A loss function $L(\theta, a)$ of the following form might be deemed
appropriate:
\[
\begin{array}{llll}
a\backslash\theta\quad\
& \theta\in\Theta_1 & \theta\in\Theta_2
& \theta\in\Theta_3
\\
a_1 & 0 & \theta-90 &
2(\theta-90) \\
a_2 & 90-\theta & 0 & \theta-110
\\
a_3 & 2(110-\theta)\quad & 110-\theta\quad & 0
\end{array}
\]
Assume that you observe that the test result $x = 115$. By using
tables of the normal distribution and the fact that if $\phi(t)$ is
the density function of the standard normal distribution, then
$\int t \phi(t) \dt = -\phi(t)$, find the appropriate action to
take on the basis of this observation. [See Berger (1985,
Sections 4.2, 4.3, 4.4)].
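[As a check on the answer obtained from normal tables, the posterior
expected losses can be evaluated numerically; a sketch in \R\ follows,
assuming the groups $(0,90)$, $[90,110]$ and $(110,\infty)$ as above.]
\begin{verbatim}
# Posterior for theta given x = 115, with prior N(100, 225) and
# likelihood N(theta, 100); then numerical posterior expected losses.
prec <- 1/225 + 1/100
post.var  <- 1/prec
post.mean <- (100/225 + 115/100) * post.var
loss <- function(theta, a)
  switch(a,
    a1 = ifelse(theta < 90, 0,
         ifelse(theta <= 110, theta - 90, 2*(theta - 90))),
    a2 = ifelse(theta < 90, 90 - theta,
         ifelse(theta <= 110, 0, theta - 110)),
    a3 = ifelse(theta < 90, 2*(110 - theta),
         ifelse(theta <= 110, 110 - theta, 0)))
post.loss <- function(a)
  integrate(function(t) loss(t, a) * dnorm(t, post.mean, sqrt(post.var)),
            lower = 0, upper = Inf)$value
sapply(c("a1", "a2", "a3"), post.loss)   # take the action minimizing this
\end{verbatim}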
\nextq In Section 7.8, a point estimator
$\delta_n$ for the current value
$\lambda$ of the parameter of a Poisson distribution
was found. Adapt
the argument to deal with the case where the underlying
distribution is geometric, that is
\[ p(x|\pi) = \pi(1-\pi)^x. \]
Generalize to the case of a negative binomial distribution, that
is,
\[ p(x|\pi) = \binom{n+x-1}{x} \pi^n (1-\pi)^x. \]
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq Show that the prior
\[ p(\alpha,\beta) \propto (\alpha + \beta)^{-5/2} \]
suggested in connection with the example on risk of tumour in a
group of rats is equivalent to a density uniform in
$\left(\alpha/(\alpha+\beta),\,(\alpha+\beta)^{-1/2}\right)$.
\nextq Observations $x_1$, $x_2$, \dots, $x_n$ are independently
distributed given parameters $\theta_1$, $\theta_2$, \dots,
$\theta_n$ according to the Poisson distribution
$p(x_i|\btheta)=\theta_i^{x_i}\exp(-\theta_i)/x_i!$. The prior
distribution for $\btheta$ is constructed hierarchically. First
the $\theta_i$s are assumed to be independently identically
distributed
given a hyperparameter $\phi$ according to the exponential
distribution $p(\theta_i|\phi)=\phi\exp(-\phi\theta_i)$ for
$\theta_i\geqslant 0$ and then $\phi$ is given the improper uniform
prior $p(\phi)\propto 1$ for $\phi\geqslant 0$. Provided that
$n\mean x>1$, prove that the posterior distribution of
$z=1/(1+\phi)$ has the beta form
\[ p(z|\vect x)\propto z^{n\mean x-2}(1-z)^n. \]
Thereby show that the posterior means of the $\theta_i$ are shrunk
by a factor $(n\mean x-1)/(n\mean x+n)$ relative to the usual
classical procedure which estimates each of the $\theta_i$ by
$x_i$.
What happens if $n\mean x\leqslant 1$?
\nextq Carry out the Bayesian analysis for known overall mean developed
in Section 8.2 above (a) with the loss
function replaced by a weighted sum of squared errors
\[ L(\btheta,\est\btheta)=\sum_{i=1}^r w_i(\theta_i-\est\theta_i)^2, \]
and (b) with it replaced by
\[ L(\btheta,\est\btheta)=\sum_{i=1}^r |\theta_i-\est\theta_i|. \]
\nextq Compare the effect of the Efron-Morris estimator on the baseball
data in Section 8.3 with the effect of a James-Stein
estimator which shrinks the values of $\pi_i$ towards
$\pi_0=0.25$ or equivalently shrinks the values of $X_i$ towards
$\mu=2\sqrt{n}\sin^{-1}\sqrt{\pi_0}$.
\nextq The \textit{Helmert transformation} is defined by the matrix
\[ \matr A=\left(\begin{array}{cccccc}
r^{-1/2}&2^{-1/2} &6^{-1/2}&12^{-1/2} &\dots
&\{r(r-1)\}^{-1/2} \\
r^{-1/2}&-2^{-1/2}&6^{-1/2}&12^{-1/2} &\dots
&\{r(r-1)\}^{-1/2} \\
r^{-1/2}&0 &-2\times 6^{-1/2}&12^{-1/2}&\dots
&\{r(r-1)\}^{-1/2} \\
r^{-1/2}&0 &0&-3\times 12^{-1/2} &\dots
&\{r(r-1)\}^{-1/2} \\
r^{-1/2}&0 &0 &0 &\dots
&\{r(r-1)\}^{-1/2} \\
\vdots &\vdots &\vdots &\vdots &\ddots&\vdots \\
r^{-1/2}&0 &0 &0 &\dots
&-(r-1)^{1/2}r^{-1/2} \\
\end{array}\right)
\]
so that the element $a_{ij}$ in row $i$, column $j$ is
\[ a_{ij}=\left\{\begin{array}{ll}
r^{-1/2}&\quad(j=1) \\ \{j(j-1)\}^{-1/2} &\quad(i < j) \\
0 &\quad(i>j>1)\\ -(j-1)^{1/2}j^{-1/2}&\quad(i=j>1).
\end{array}\right. \]
It is also useful to write $\balpha_j$ for the (column) vector
which consists of the $j$th column of the matrix $\matr A$. Show
that if the variates $X_i$ are independently $\N(\theta_i, 1)$,
then the variates
$W_j=\balpha_j\transpose(\vect X-\bmu)=\sum_i a_{ij}(X_i-\mu_i)$
are independently normally distributed with unit variance and
such that $\E W_j=0$ for $j>1$ and
\[ \vect W\transpose\vect W=\sum_j W_j^2=\sum_i (X_i-\mu_i)^2
=(\vect X-\bmu)\transpose(\vect X-\bmu). \]
By taking $a_{ij}\propto\theta_j-\mu_j$ for $i>j$, $a_{ij}=0$ for
$i < j$ and $a_{jj}$ such that $\sum_j a_{ij}=0$, extend this result
to the general case and show that
$\E\,W_1\propto\gamma=\sum_i(\theta_i-\mu_i)^2$.
Deduce that the distribution of a non-central chi-squared variate
depends only on $r$ and $\gamma$.
\nextq Show that $R(\btheta,\bhthetajsplus) < R(\btheta,\bhthetajs)$ where
\[
\bhthetajsplus=\bmu+\max\left[\left(1-\frac{r-2}{S_1}\right),\,0\right]
(\vect X-\bmu) \]
(Lehmann, 1983, Section 4.6, Theorem 6.2).
\nextq Writing
\[ \est\btheta=(\matr A\transpose\matr A)^{-1}
\matr A\transpose\vect x,\qquad
\est\btheta_k=(\matr A\transpose\matr A+k\matr I)^{-1}
\matr A\transpose\vect x \]
for the least-squares and ridge regression estimators for
regression coefficients $\btheta$, show that
\[ \est\btheta-\est\btheta_k=k(\matr A\transpose\matr
A)^{-1}\est\btheta_k \]
and that the bias of $\est\btheta_k$ is
\[ \vect b(k)=\{(\matr A\transpose\matr A+k\matr I)^{-1}
\matr A\transpose\matr A- \matr I\}\btheta \]
while its variance-covariance matrix is
\[ \Var\est\btheta_k=\phi(\matr A\transpose\matr A+k\matr I)^{-1}
\matr A\transpose\matr A(\matr A\transpose\matr A+k\matr I)^{-1}. \]
Deduce expressions for the sum $\mathcal G(k)$ of the squares of
the biases and for the sum $\mathcal F(k)$ of the variances of the
regression coefficients, and hence show that the mean square error
is
\[ MSE_k=\E(\est\btheta_k-\btheta)\transpose(\est\btheta_k-\btheta)
           =\mathcal F(k)+\mathcal G(k). \]
Assuming that $\mathcal F(k)$ is continuous and monotonic
decreasing with $\mathcal F^{\,\prime}(0)<0$ and that
$\mathcal G(k)$ is
continuous and monotonic increasing with $\mathcal G(0)=
\mathcal G^{\,\prime}(0)=0$, deduce that there always exists a $k$
such that $MSE_k < MSE_0$ (Theobald, 1974).
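[A small \R\ sketch on simulated data (the design matrix and
coefficients below are invented purely for illustration) can be used to
check the first identity numerically.]
\begin{verbatim}
# Least-squares and ridge estimates on simulated data, checking the
# identity  theta.hat - theta.hat_k = k (A'A)^{-1} theta.hat_k .
set.seed(2)
A <- matrix(rnorm(50 * 3), 50, 3)        # invented 50 x 3 design matrix
theta <- c(1, 2, -1)
x <- A %*% theta + rnorm(50)
k <- 0.5                                 # an arbitrary ridge constant
ls.est    <- solve(t(A) %*% A) %*% t(A) %*% x
ridge.est <- solve(t(A) %*% A + k * diag(3)) %*% t(A) %*% x
cbind(ls.est - ridge.est, k * solve(t(A) %*% A) %*% ridge.est)
\end{verbatim}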
\nextq Show that the matrix $\matr H$ in Section 8.6
satisfies $\matr B\transpose\matr H^{-1}\matr B=\matr 0$ and that
if $\matr B$ is square and non-singular then $\matr H^{-1}$ vanishes.
\nextq Consider the following particular case of the two-way layout.
Suppose that eight plots are harvested, on four of which one variety
has been sown, while a different variety has been sown on the other
four. Of the four plots sown with each variety, two have been treated
with one fertilizer and two with another. The yield will be
normally distributed with a mean $\theta$ dependent on the
fertilizer and the variety and with variance $\phi$. It is
supposed \textit{a priori} that the mean yields
for plots sown with the two different varieties are
independently normally distributed with mean $\alpha$ and variance
$\psi_{\alpha}$, while the effect of the two different fertilizers
will add an amount which is independently normally distributed with
mean $\beta$ and variance $\psi_{\beta}$. This fits into the
situation described in Section 8.6 with $\Phi$
being $\phi$ times an $8\times 8$ identity matrix and
\[ \matr A=
\left(\begin{array}{cccc}1&0&1&0\\1&0&1&0\\1&0&0&1\\1&0&0&1\\
0&1&1&0\\0&1&1&0\\0&1&0&1\\0&1&0&1\end{array}\right)
;\qquad\matr
B=\left(\begin{array}{cc}1&0\\1&0\\0&1\\0&1\end{array}\right)
;\qquad\BPsi=
\left(\begin{array}{cccc}
\psi_{\alpha}&0&0&0\\
0&\psi_{\alpha}&0&0\\
0&0&\psi_{\beta}&0\\
0&0&0&\psi_{\beta}
\end{array}\right).
\]
Find the matrix $\matr K^{-1}$ needed to determine the posterior of
$\btheta$.
\nextq Generalize the theory developed in Section 8.6,
which deals with the case where $\vect x\sim\N(\matr A\btheta,\BPhi)$
and $\btheta\sim\N(\matr B\bmu,\BPsi)$ and knowledge of $\bmu$
is vague, to the case where
$\bmu\sim\N(\matr C\bnu,\matr K)$\ (Lindley and Smith, 1972).
\nextq Find the elements of the variance-covariance matrix $\BSigma$
for the one way model in the case where $n_i=n$ for all $i$.
\section{Exercises on Chapter \arabic{section}}
\setcounter{qno}{0}
\nextq Find the value of $\int_0^1 \text{e}^x\dx$ by crude Monte Carlo
integration using a sample size of $n=10$ values from a uniform
distribution $\U(0, 1)$ taken from tables of random numbers [use,
for example, groups of random digits from Lindley and Scott (1995,
Table 27) or Neave (1978, Table 8.1)]. Repeat the experiment 10
times and compute the overall mean and the sample standard
deviation of the values you obtain. What is the theoretical
value of the population standard deviation and how does the
value you obtained compare with it?
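[If tables of random numbers are not to hand, the experiment is easily
mimicked in \R; the seed is arbitrary.]
\begin{verbatim}
# Ten crude Monte Carlo estimates of the integral of exp(x) over (0,1),
# each based on n = 10 uniform random numbers.
set.seed(3)
estimates <- replicate(10, mean(exp(runif(10))))
mean(estimates); sd(estimates)
# Theoretical standard deviation of a single estimate based on n = 10
sqrt(((exp(2) - 1)/2 - (exp(1) - 1)^2) / 10)
\end{verbatim}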
\nextq Suppose that, in a Markov chain with just two states, the
probabilities of going from state $i$ to state $j$ in one time unit
are given by the entries of the matrix
\[ \matr A=\left(\begin{array}{cc}
\slopefrac{1}{4}&\slopefrac{3}{4}\\ \slopefrac{1}{2}&\slopefrac{1}{2}
\end{array}\right) \]
in which $i$ represents the row and $j$ the column. Show that the
probability of getting from state $i$ to state $j$ in $t$ time
units is given by the $t$th power of the matrix $\matr A$ and that
\[ \matr A^t=\left(\begin{array}{cc}
\slopefrac{2}{5}&\slopefrac{3}{5}\\ \slopefrac{2}{5}&\slopefrac{3}{5}
\end{array}\right)
+\left(\mbox{\large{$-\frac{1}{4}$}}\right)^t
\left(\begin{array}{cc}
\slopefrac{3}{5}&-\slopefrac{3}{5}\\-\slopefrac{2}{5}&\slopefrac{2}{5}
\end{array}\right). \]
Deduce that, irrespective of the state the chain started in, after
a long time it will be in the first state with probability
$\slopefrac{2}{5}$ and in the second state with probability
$\slopefrac{3}{5}$.
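[The limiting behaviour is easily checked numerically in \R, for
example as follows.]
\begin{verbatim}
# Raise the transition matrix to a high power and compare the rows
# with the stationary distribution (2/5, 3/5).
A <- matrix(c(1/4, 3/4, 1/2, 1/2), nrow = 2, byrow = TRUE)
At <- diag(2)
for (t in 1:20) At <- At %*% A
At
\end{verbatim}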
\nextq Smith (1969, Section 21.10) quotes an example on genetic linkage
in which we have observations $\vect x=(x_1, x_2, x_3, x_4)$ with
cell probabilities
\[ \left(\quarter+\quarter\eta,\,\quarter\eta,\,
\quarter(1-\eta),\,\quarter(1-\eta)+\quarter\right). \]
The values quoted are $x_1=461$, $x_2=130$, $x_3=161$ and
$x_4=515$. Divide $x_1$ into $y_0$ and $y_1$ and $x_4$
into $y_4$ and $y_5$
to produce augmented data $\vect y=(y_0, y_1, y_2, y_3, y_4, y_5)$
and use the \EM\ al\-gorithm to estimate $\eta$.
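[A minimal \R\ sketch of the \EM\ iteration, assuming a uniform prior
so that the posterior mode coincides with the maximum likelihood
estimate; the starting value and number of iterations are arbitrary.]
\begin{verbatim}
# EM for cell probabilities ((1+eta)/4, eta/4, (1-eta)/4, (2-eta)/4),
# splitting x1 into (y0, y1) and x4 into (y4, y5).
x <- c(461, 130, 161, 515)
eta <- 0.5                                   # arbitrary starting value
for (iter in 1:50) {
  y1 <- x[1] * eta / (1 + eta)               # E-step: expected part of x1
  y4 <- x[4] * (1 - eta) / (2 - eta)         #   and of x4 involving eta
  eta <- (y1 + x[2]) / (y1 + x[2] + x[3] + y4)    # M-step
}
eta
\end{verbatim}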
\nextq Dempster \textit{et al.}\ (1977) define a generalized \EM\
al\-gorithm (abbreviated as a \GEM\ al\-gorithm) as one in which
$Q(\theta^{(t+1)},\theta^{(t)})\geqslant
Q(\theta^{(t)},\theta^{(t)})$.
Give reasons for believing that \GEM\ al\-gorithms converge to the
posterior mode.
\nextq In question 16 in Chapter 2
we supposed that the results of a certain test were known, on the
basis of general theory, to be normally distributed about the same
mean $\mu$ with the same variance $\phi$, neither of which is
known. In that question we went on to suppose that your prior
beliefs about
$(\mu, \phi)$ could be represented by a normal/chi-squared
distribution with
\[ \nu_0 = 4, \qquad S_0 = 350, \qquad n_0 = 1, \qquad \theta_0 =
85. \]
Find a semi-conjugate prior which has marginal distributions that
are close to the marginal distributions of the normal/chi-squared
prior but is such that the mean and variance are independent
\textit{a priori}. Now suppose as previously that 100 observations
are obtained from the population with mean 89 and sample variance
$s^2 = 30$. Find the posterior distribution of $(\mu, \phi)$.
Compare the posterior mean obtained by the \EM\ al\-gorithm with
that obtained from the fully conjugate prior.
\nextq A textile company weaves a fabric on a large number of looms.
Four looms are selected at random from those available, and four
observations of the tensile strength of fabric woven on each of
these looms are made (there is no significance to the order
of the observations from each loom). The resulting
data are given below:
\[
\begin{array}{ccccc}
\text{Loom}&\multicolumn{4}{c}{\qquad\text{Observations}} \\
1 &\qquad 98 \quad&\quad 97 \quad&\quad 99 \quad&\quad 96
\\
2 &\qquad 91 \quad&\quad 90 \quad&\quad 93 \quad&\quad 92
\\
3 &\qquad 96 \quad&\quad 95 \quad&\quad 97 \quad&\quad 95
\\
4 &\qquad 95 \quad&\quad 96 \quad&\quad 99 \quad&\quad 98
\end{array}
\]
Estimate the means for each of the looms, the overall mean, the
variance of observations from the same loom, and the variance of
means from different looms in the population.
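[Simple analysis-of-variance (method-of-moments) estimates, which might
serve as starting values for a fuller hierarchical analysis, can be
obtained in \R\ along the following lines.]
\begin{verbatim}
strength <- matrix(c(98, 97, 99, 96,
                     91, 90, 93, 92,
                     96, 95, 97, 95,
                     95, 96, 99, 98), nrow = 4, byrow = TRUE)
loom.means   <- rowMeans(strength)
overall.mean <- mean(strength)
within.var   <- mean(apply(strength, 1, var))     # variance within looms
between.var  <- var(loom.means) - within.var / 4  # variance between looms
loom.means; overall.mean; within.var; between.var
\end{verbatim}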
\nextq Write computer programs in C++ equivalent to the programs in
\R\ in this chapter.
\nextq Use the data augmentation al\-gorithm to estimate the posterior
density of the parameter $\eta$ in the linkage model in question
3 above.
\nextq Suppose that $y\,|\,\pi\sim\B(n,\pi)$ and
$\pi\,|\,y\sim\Be(y+\alpha,\,n-y+\beta)$ where $n$ is a Poisson
variable of mean $\lambda$ as opposed to being fixed as in Section
9.4. Use the Gibbs sampler (chained data augmentation)
to find the unconditional distribution of $n$ in the case where
$\lambda=16$, $\alpha=2$ and $\beta=4$ (cf.\ Casella and George,
1992).
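[A sketch of the Gibbs sampler in \R, assuming the joint distribution
used by Casella and George (1992), under which $n-y$ given $y$ and
$\pi$ has a Poisson distribution of mean $\lambda(1-\pi)$; the run
lengths and starting values are arbitrary.]
\begin{verbatim}
# Gibbs sampler for (y, pi, n); the full conditionals used are
#   pi | y, n      ~ Be(y + alpha, n - y + beta)
#   n - y | y, pi  ~ P(lambda (1 - pi))
#   y | pi, n      ~ B(n, pi)
lambda <- 16; alpha <- 2; beta <- 4
nsim <- 10000
n <- 16; y <- 8                          # arbitrary starting values
nsamp <- numeric(nsim)
for (s in 1:nsim) {
  p <- rbeta(1, y + alpha, n - y + beta)
  n <- y + rpois(1, lambda * (1 - p))
  y <- rbinom(1, n, p)
  nsamp[s] <- n
}
table(nsamp[-(1:1000)]) / (nsim - 1000)  # estimated distribution of n
\end{verbatim}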
\nextq Use the Gibbs sampler (chained data augmentation) to find the
mean and variance of the posterior distribution of $\theta$ for the
data in question 5 above, using the prior you derived in answer to
that question.
\nextq The data below represent the weights of $r=30$ young rats
measured weekly for $n=5$ weeks as quoted by Gelfand
\textit{et al.}\ (1990), Tanner (1996, Table 1.3 and Section
6.2.1), Carlin and Louis (2000, Example 5.6):
{\catcode`?=\active
\def?{\kern\digitwidth}
\small
\[
\begin{array}{cccccccccccc}
\text{Rat$\backslash$Week}&1&2&3&4&5&\text{Rat$\backslash$Week}&1&2&3&4&5
\\
?1\qquad& 151&199&246&283&320& 16\qquad& 160&207&248&288&324 \\
?2\qquad& 145&199&249&293&354& 17\qquad& 142&187&234&280&316 \\
?3\qquad& 147&214&263&312&328& 18\qquad& 156&203&243&283&317 \\
?4\qquad& 155&200&237&272&297& 19\qquad& 157&212&259&307&336 \\
?5\qquad& 135&188&230&280&323& 20\qquad& 152&203&246&286&321 \\
?6\qquad& 159&210&252&298&331& 21\qquad& 154&205&253&298&334 \\
?7\qquad& 141&189&231&275&305& 22\qquad& 139&190&225&267&302 \\
?8\qquad& 159&201&248&297&338& 23\qquad& 146&191&229&272&302 \\
?9\qquad& 177&236&285&340&376& 24\qquad& 157&211&250&285&323 \\
10\qquad& 134&182&220&260&296& 25\qquad& 132&185&237&286&331 \\
11\qquad& 160&208&261&313&352& 26\qquad& 160&207&257&303&345 \\
12\qquad& 143&188&220&273&314& 27\qquad& 169&216&261&295&333 \\
13\qquad& 154&200&244&289&325& 28\qquad& 157&205&248&289&316 \\
14\qquad& 171&221&270&326&358& 29\qquad& 137&180&219&258&291 \\
15\qquad& 163&216&242&281&312& 30\qquad& 153&200&244&286&324
\end{array}
\]
}
The weight of the $i$th rat in week $j$ is denoted $x_{ij}$ and
we suppose that weight growth is linear, that is,
\[ x_{ij}\sim\N(\alpha_i+\beta_i j,\,\phi), \]
but that the slope and intercept vary from rat to rat. We further
suppose that $\alpha_i$ and $\beta_i$ have a bivariate normal
distribution, so that
\[ \btheta_i=\left(\begin{array}{c}\alpha_i\\
\beta_i\end{array}\right)
\sim\N(\btheta_0,\,\BSigma)\quad\text{where}\quad
\btheta_0=\left(\begin{array}{c}\alpha_0\\
\beta_0\end{array}\right),
\]
and thus we have a random effects model. At the third stage, we
suppose that the precision matrix $\matr V=\BSigma^{-1}$ has the
Wishart distribution
\[ p(\matr V\,|\,\nu, \BOmega)\propto
\frac{|\matr V|^{(\nu-k-1)/2}}{|\BOmega|^{\nu/2}}
\exp\left[-\half\text{Trace}(\BOmega^{-1}\matr V)\right]. \]
Methods of sampling from this distribution are described in Odell
and Feiveson (1966), Kennedy and Gentle (1990, Section 6.5.10) and
Gelfand \textit{et al.}\ (1990). [This example was omitted from
the main text because we have avoided use of the Wishart
distribution elsewhere in the book. A slightly simpler model
in which $\BSigma$ is assumed to be diagonal is to be found as
the example `Rats' distributed with WinBUGS.]
Explain in detail how you would use the Gibbs sampler to estimate
the posterior distributions of $\alpha_0$ and $\beta_0$, and if
possible carry out this procedure.
\nextq Use the Met\-ropo\-lis-Hast\-ings al\-gorithm to estimate the
posterior density of the parameter $\eta$ in the linkage model in
Sections 9.2 and 9.3 using candidate values
generated from a uniform distribution on $(0, 1)$ [cf.\ Tanner
(1996, Section 6.5.2)].
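[A sketch of an independence-chain Met\-ropo\-lis-Hast\-ings sampler in
\R. The data below are the classic linkage counts often quoted with
this model and are used only by way of illustration; substitute the
values and cell probabilities of Sections 9.2 and 9.3.]
\begin{verbatim}
# Independence-chain Metropolis-Hastings with U(0,1) candidates for a
# linkage model with cell probabilities
# ((2+eta)/4, (1-eta)/4, (1-eta)/4, eta/4) and a flat prior for eta.
x <- c(125, 18, 20, 34)                  # illustrative data; replace as needed
log.post <- function(eta)
  x[1]*log(2 + eta) + (x[2] + x[3])*log(1 - eta) + x[4]*log(eta)
nsim <- 10000
eta <- 0.5
etasamp <- numeric(nsim)
for (s in 1:nsim) {
  cand <- runif(1)                       # candidate value
  if (log(runif(1)) < log.post(cand) - log.post(eta)) eta <- cand
  etasamp[s] <- eta
}
mean(etasamp); var(etasamp)
\end{verbatim}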
\nextq Write a WinBUGS program to analyze the data on wheat yield
considered towards the end of Section 2.13 and in Section 9.3.
\nextq In bioassays the response may vary with a covariate termed the
\textit{dose}. A typical example involving a binary response is
given in the table below, where $R$ is the number of beetles
killed after five hours' exposure to gaseous carbon disulphide at
various concentrations (data from Bliss, 1935, quoted by Dobson,
2002, Example 7.3.1).
\begin{center}
\begin{tabular}{lll}
\hline
\multicolumn{1}{c}{Dose $x_i$} & Number of & Number \\
($\log_{10} \text{CS}_2\ \text{mg l}^{-1}$)
& insects, $n_i$ & killed, $r_i$ \\
\hline
\qquad 1.6907 & \quad 59 & \quad \phantom{1}6 \\
\qquad 1.7242 & \quad 60 & \quad 13 \\
\qquad 1.7552 & \quad 62 & \quad 18 \\
\qquad 1.7842 & \quad 56 & \quad 28 \\
\qquad 1.8113 & \quad 63 & \quad 52 \\
\qquad 1.8369 & \quad 59 & \quad 53 \\
\qquad 1.8610 & \quad 62 & \quad 61 \\
\qquad 1.8839 & \quad 60 & \quad 60 \\
\hline
\end{tabular}
\end{center}
\noindent
Fit a logistic regression model and plot the proportion killed against
dose and the fitted line.
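[A classical maximum likelihood fit with \texttt{glm} in \R, which may
be used as a check on, or a starting point for, a Bayesian analysis.]
\begin{verbatim}
dose   <- c(1.6907, 1.7242, 1.7552, 1.7842, 1.8113, 1.8369, 1.8610, 1.8839)
n      <- c(59, 60, 62, 56, 63, 59, 62, 60)
killed <- c(6, 13, 18, 28, 52, 53, 61, 60)
fit <- glm(cbind(killed, n - killed) ~ dose, family = binomial)
summary(fit)
plot(dose, killed/n, ylab = "Proportion killed")
lines(dose, fitted(fit))
\end{verbatim}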
\section{Exercises on Chapter \arabic{section}}
\startqs
\nextq Suppose that $x \sim \C(0,1)$ has a Cauchy distribution. It is
easily shown that $\eta = \Pr(x > 2) = \tan^{-1}(\half)/\pi =
0.147\,583\,6$, but we will consider Monte Carlo methods of
evaluating this probability.
\begin{description}
\item[\quad(a)] Show that if $k$ is the number of values greater
    than 2 in a random sample of size $n$ from a Cauchy distribution,
    then $k/n$ is an estimate of $\eta$ with variance $0.125\,802\,7/n$.
\item[\quad(b)] Let $p(x)=2/x^2$ so that $\int_x^{\infty}p(\xi)
    \dxi = 2/x$. Show that if $x \sim \U(0,1)$ is uniformly
    distributed over the unit interval then $y=2/x$ has the
    density $p$ and that all values of $y$ satisfy $y\geqslant
    2$, and hence that
    \[ \frac{1}{n}\sum_{i=1}^n \frac{1}{2\pi}\frac{y_i^2}{1+y_i^2} \]
    gives an estimate of $\eta$ by importance sampling.
\item[\quad(c)] Deduce that if $x_1$, $x_2$, \dots, $x_n$ are
independent $\U(0,1)$ variates then
\[ \widehat\eta =
\frac{1}{n}\sum_{i=1}^n\frac{1}{2\pi}\frac{4}{4+x_i^2} \]
gives an estimate of $\eta$.
\item[\quad(d)] Check that $\widehat\eta$ is an unbiased
estimate of $\eta$ and show that
\[ \E\widehat\eta^2 = \frac{\tan^{-1}(\half)+\twofifths}{4\pi^2} \]
and deduce that
\[ \Var\widehat\eta = 0.000\,095\,5 \]
so that this estimator has a notably smaller variance than
the estimate considered in (a).
\end{description}
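[The two estimators can be compared empirically in \R; the sample size
and seed are arbitrary.]
\begin{verbatim}
# Compare the crude estimator of part (a) with the estimator of part (c).
set.seed(4)
n <- 10000
eta <- atan(0.5)/pi                        # exact value
crude  <- mean(rcauchy(n) > 2)             # part (a)
u <- runif(n)
smooth <- mean((1/(2*pi)) * 4/(4 + u^2))   # part (c)
c(eta, crude, smooth)
\end{verbatim}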
\nextq Apply sampling importance resampling starting from random
variables uniformly distributed over $(0,1)$ to estimate
the mean and variance of a beta distribution $\Be(2,3)$.
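[A minimal \R\ sketch of sampling importance resampling for this case;
the sample sizes are arbitrary.]
\begin{verbatim}
set.seed(5)
m <- 10000
theta <- runif(m)                      # draws from the U(0,1) proposal
w <- dbeta(theta, 2, 3)                # importance weights (proposal density 1)
resample <- sample(theta, m, replace = TRUE, prob = w)
mean(resample); var(resample)          # compare with 2/5 and 1/25
\end{verbatim}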
\nextq Use the sample found in the previous question to find a 90\% HDR
for $\Be(2,3)$ and compare the resultant limits with the values
found using the methodology of Section 3.1. Why do the values
differ?
\nextq Apply the methodology used in the numerical example in Section
\ref{sec:varbayes} to the dataset used in both Exercise 16 on
Chapter 2 and Exercise 5 on Chapter 9.
\nextq Find the Kullback-Leibler divergence $\I(q:p)$ when $p$ is a
binomial distribution $\B(n,\pi)$ and $q$ is a binomial
distribution $\B(n,\rho)$. When does $\I(q:p)=\I(p:q)$?
\nextq Find the Kullback-Leibler divergence $\I(q:p)$ when $p$ is a
normal distribution $\N(\mu,\phi)$ and $q$ is a normal
distribution $\N(\nu,\psi)$.
\nextq Let $p$ be the density $2(2\pi)^{-1/2}\exp(-\half x^2)$ $(x>0)$
of the modulus $|z|$ of a standard normal distribution and let
$q$ be the density $\beta^{-1}\exp(-x/\beta)$ $(x>0)$ of an
$\Ex(\beta)$ distribution. Find the value of $\beta$ such
that $q$ is as close an approximation to $p$ as possible in
the sense that $\I(q:p)$ is a minimum.
\nextq The paper by Corduneanu and Bishop (2001) referred to in Section
\ref{sec:varbayesgeneral} can be found on the web at
{\small
\[
\texttt{http://research.microsoft.com/pubs/67239/bishop-aistats01.pdf}
\]
}
\!\!H\"ardle's data set is available in \R\ by typing
\texttt{data(faithful)}. Fill in the details of the analysis of
a mixture of multivariate normals given in that section.
\nextq Carry out the calculations in Section 10.4 for the genetic
linkage data quoted by Smith which was given in Exercise 3
on Chapter 9.
\nextq A group of $n$ students sit two exams. Exam one is
on history and exam two is on chemistry. Let $x_i$ and $y_i$
denote the $i$th student's score in the history and chemistry
exams, respectively. The following linear regression model is
proposed for the relationship between the two exam scores:
\[ y_i = \alpha + \beta x_i + \varepsilon_i\quad (i = 1, 2, \dots, n) \]
where $\varepsilon_i \sim \N(0,1/\tau)$.
Assume that $\alpha$, $\beta$ and $\tau$ are unknown parameters
to be estimated and $\vect x = (x_1, x_2, \dots, x_n)$ and
$\vect y = (y_1, y_2, \dots, y_n)$.
Describe a reversible jump \MCMC\ algorithm including discussion
of the acceptance probability, to move between the four competing
models:
\begin{enumerate}
\item $y_i = \alpha + \varepsilon_i$;
\item $y_i = \alpha + \beta x_i + \varepsilon_i$;
\item $y_i = \alpha + \lambda t_i + \varepsilon_i$;
\item $y_i = \alpha + \beta x_i + \lambda t_i + \varepsilon_i$.
\end{enumerate}
Note that if $z$ is a random variable with probability density
function $f$ given by
\[ f(z) \propto \exp\left(-\half A\left(z^2-2Bz\right)\right), \]
then $z \sim \N(B,1/A)$ [due to P.~Neal].
\end{document}
%