% scientificComputing/bootstrap/exercises/resampling-1.tex

\documentclass[12pt,a4paper,pdftex]{exam}

\newcommand{\exercisetopic}{Resampling}
\newcommand{\exercisenum}{8}
\newcommand{\exercisedate}{December 14th, 2020}

\input{../../exercisesheader}

\firstpagefooter{Prof. Dr. Jan Benda}{}{jan.benda@uni-tuebingen.de}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}

\input{../../exercisestitle}

\begin{questions}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\question \qt{Read chapter 7 of the script on ``resampling methods''!}\vspace{-3ex}

\question \qt{Bootstrap the standard error of the mean}
We want to compute the standard error of the mean of a data set by
means of the bootstrap method and compare the result with the formula
``standard deviation divided by the square-root of $n$''.
\begin{parts}
  \part Download the file \code{thymusglandweights.dat} from Ilias.
  This is a data set of the weights of the thymus glands of 14-day-old chicken embryos
  measured in milligrams.
  \part Load the data into Matlab (\code{load} function).
  \part Compute histogram, mean, and standard error of the mean of the first 80 data points.
  \part Compute the standard error of the mean of the first 80 data
  points by bootstrapping the data 500 times. Write a function that
  bootstraps the standard error of the mean of a given data set. The
  function should also return a vector with the bootstrapped means.
  \part Compute the 95\,\% confidence interval for the mean from the
  bootstrap distribution (\code{quantile()} function) --- the
  interval that contains the true mean with 95\,\% probability.
  \part Use the whole data set and the bootstrap method for computing
  the dependence of the standard error of the mean on the sample
  size $n$.
  \part Compare your result with the formula for the standard error
  $\sigma/\sqrt{n}$.
\end{parts}
\begin{solution}
  \lstinputlisting{bootstrapmean.m}
  \lstinputlisting{bootstraptymus.m}
  \includegraphics[width=0.5\textwidth]{bootstraptymus-datahist}
  \includegraphics[width=0.5\textwidth]{bootstraptymus-meanhist}
  \includegraphics[width=0.5\textwidth]{bootstraptymus-samples}
\end{solution}


\question \qt{Student t-distribution}
The distribution of Student's t, $t=\bar x/(\sigma_x/\sqrt{n})$, the
estimated mean $\bar x$ of a data set of size $n$ divided by the
estimated standard error of the mean $\sigma_x/\sqrt{n}$, where
$\sigma_x$ is the estimated standard deviation, is not a normal
distribution but a Student-t distribution.  We want to compute the
Student-t distribution and compare it with the normal distribution.
\begin{parts}
\part Generate 100000 normally distributed random numbers.
\part Draw from these data 1000 samples of size $n=3$, 5, 10, and
50. For each sample size $n$ ...
\part ... compute the mean $\bar x$ of the samples and plot the
probability density of these means.
\part ... compare the resulting probability densities with corresponding
normal distributions.
\part ... compute Student's $t=\bar x/(\sigma_x/\sqrt{n})$ and compare its
distribution with the normal distribution with standard deviation of
one. Is $t$ normally distributed? Under which conditions is $t$
normally distributed?
\end{parts}
\newsolutionpage
\begin{solution}
  \lstinputlisting{tdistribution.m}
  \includegraphics[width=1\textwidth]{tdistribution-n03}\\
  \includegraphics[width=1\textwidth]{tdistribution-n05}\\
  \includegraphics[width=1\textwidth]{tdistribution-n10}\\
  \includegraphics[width=1\textwidth]{tdistribution-n50}
\end{solution}


\continue
\question \qt{Permutation test of correlations} \label{correlationtest}
We want to compute the significance of a correlation by means of a permutation test.
\begin{parts}
  \part \label{correlationtestdata} Generate 1000 correlated pairs
  $x$, $y$ of random numbers according to:
\begin{verbatim}
n = 1000
a = 0.2;
x = randn(n, 1);
y = randn(n, 1) + a*x;
\end{verbatim}
  \part Generate a scatter plot of the two variables.
  \part Why is $y$ correlated with $x$?
  \part Compute the correlation coefficient between $x$ and $y$.
  \part What do you need to do in order to destroy the correlations between the $x$-$y$ pairs?
  \part Do exactly this 1000 times and compute each time the correlation coefficient.
  \part Compute and plot the probability density of these correlation
  coefficients.
  \part Is the correlation of the original data set significant?
  \part What does ``significance of the correlation'' mean?
%  \part Vary the sample size \code{n} and compute in the same way the
%  significance of the correlation.
\end{parts}
\begin{solution}
  \lstinputlisting{correlationsignificance.m}
  \includegraphics[width=1\textwidth]{correlationsignificance}
\end{solution}

\question \qt{Bootstrap the correlation coefficient}
The permutation test generates the distribution of the null hypothesis
of uncorrelated data and we check whether the correlation coefficient
of the data differs significantly from this
distribution. Alternatively we can bootstrap the data while keeping
the pairs and determine the confidence interval of the correlation
coefficient of the data. If this confidence interval does not include a
correlation coefficient of zero we can conclude that the correlation
coefficient of the data indeed quantifies correlated data.

We take the same data set that we have generated in
exercise~\ref{correlationtest} (\ref{correlationtestdata}).
\begin{parts}
  \part Bootstrap 1000 times the correlation coefficient from the
  data, i.e.  generate bootstrap data by randomly resampling the
  original data pairs with replacement. Use the \code{randi()}
  function for generating random indices that you can use to select a
  random sample from the original data.
  \part Compute and plot the probability density of these correlation
  coefficients.
  \part Is the correlation of the original data set significant?
\end{parts}
\begin{solution}
  \lstinputlisting{correlationbootstrap.m}
  \includegraphics[width=1\textwidth]{correlationbootstrap}
\end{solution}


\continuepage
\question \qt{Permutation test of difference of means}
By means of a permutation test we want to test whether two data sets
come from distributions that differ in their mean.
\begin{parts}
  \part Generate two normally distributed data sets $x$ and $y$
  each containing $n=200$ samples. Let's assume the $x$ samples are
  measurements of the membrane potential of a mammalian photoreceptor
  in darkness with a mean of $-40$\,mV and a standard deviation of
  1\,mV. The $y$ values are the membrane potentials measured under dim
  illumination and come from a distribution with the same standard
  deviation and a mean of $-40.5$\,mV. See section 5.2 ``Scaling and
  shifting random numbers'' in the script.
  \part Plot histograms of the $x$ and $y$ data in a single
  plot. Choose appropriate bins.
  \part Compute the means of $x$ and $y$ and their difference.
  \part The null hypothesis is that the $x$ and $y$ data come from the
  same distribution. How can you generate new samples $x_r$ and $y_r$
  from the original data that come from the same distribution?
  \part Do exactly this 1000 times and compute each time the
  difference of the means of the two resampled samples.
  \part Compute and plot the probability density of the resulting
  distribution of the null hypothesis.
  \part Is the difference of the means of the original data sets significant?
  \part Repeat this procedure for $y$ samples whose mean is closer to
  or further away from the mean of the $x$ data set. For this put the
  computations of the permutation test in a function and all the plotting
  in another function.
\end{parts}
\begin{solution}
  \lstinputlisting{meandiffpermutation.m}
  \lstinputlisting{meandiffplot.m}
  \lstinputlisting{meandiffsignificance.m}
  \includegraphics[width=1\textwidth]{meandiffsignificance}
\end{solution}

\end{questions}

\end{document}