% NOTE(review): the following lines are web-page residue from the repository
% viewer (archive banner, file path, size metadata) — they are not part of
% the TeX source and are commented out so the file compiles.
% This repository has been archived on 2021-05-17. You can view files and clone it, but cannot push or open issues or pull requests.
% scientificComputing/statistics-fabian/lecture_statistics03.tex
% 330 lines, 8.8 KiB, TeX
\documentclass{beamer}
\usepackage{xcolor}
\usepackage{listings}
\usepackage{pgf}
%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
%\usepackage{multimedia}
% Input encoding: the source is plain ASCII (umlauts are written as \"u),
% so the obsolete latin1 option is replaced by utf8 (the modern default).
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{ulem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mode<presentation>
{
  \usetheme{Singapore}
  \setbeamercovered{opaque}
  \usecolortheme{tuebingen}% project-local color theme
  \setbeamertemplate{navigation symbols}{}% hide the navigation bar
  \usefonttheme{default}
  \useoutertheme{infolines}
  % \useoutertheme{miniframes}
}
% Show a section title plus a mini table of contents at the start of
% every subsection.
\AtBeginSubsection[]
{
  \begin{frame}<beamer>
    \begin{center}
      \Huge \insertsectionhead
    \end{center}
    \tableofcontents[
      currentsubsection,
      hideothersubsections,
      sectionstyle=show/hide,
      subsectionstyle=show/shaded,
    ]
    % \frametitle{\insertsectionhead}
  \end{frame}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\setbeamertemplate{blocks}[rounded][shadow=true]
\title[]{Scientific Computing -- Statistics}
\author[Statistics]{Fabian Sinz\\Dept. Neuroethology,
University T\"ubingen\\
Bernstein Center T\"ubingen}
\institute[Scientific Computing]{}
\date{10/22/2014}
%\logo{\pgfuseimage{logo}}
\subject{Lectures}
%%%%%%%%%% configuration for code listings (MATLAB)
\lstset{
  basicstyle=\ttfamily,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  captionpos=b,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Right-aligned, tiny citation line.  The trailing % signs keep the macro
% from inserting spurious spaces into the surrounding text.
\newcommand{\mycite}[1]{%
  \begin{flushright}
    \tiny \color{black!80} #1%
  \end{flushright}%
}
\input{../latex/environments.tex}% project-local environments (e.g. task)
\makeatother
\begin{document}
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 3 -- study design: choosing n}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{choosing n for confidence intervals}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{general theme}
\begin{enumerate}
\item make an educated guess about the true parameters
\item state how accurate/powerful you want to be
\item select $n$ based on that
\end{enumerate}
\end{frame}
\begin{frame}
  \frametitle{estimating a single mean}
  \framesubtitle{standard error and $\alpha$}
  \begin{itemize}
  \item Assume you have an estimate $s$ of the standard deviation from
    the literature.
  \item The $95$\% confidence interval is given by
    % \[ ... \] replaces the plain-TeX $$ ... $$, which has wrong
    % vertical spacing in LaTeX (l2tabu).
    \[\underbrace{|\tilde\mu - \mu_0|}_{=:\delta} \ge t_{97.5\%,
    \nu}\frac{s}{\sqrt{n}}\]\pause
  \item How should we choose $n$ to get a confidence interval of a
    particular size $\pm \delta$?\pause
  \item[] We should set $n$ to be
    \[ n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 \]
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{exercise}
  \begin{task}{choosing $n$}
    Example from last lecture: Literature value of thymus gland
    weights is $34.3$g. The estimate of the standard deviation from
    the literature is $s=10$g.
    The equation for $n$ is
    % \[ ... \] replaces the plain-TeX $$ ... $$ (l2tabu).
    \[ n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 \]
    \begin{itemize}
    \item Assume we want to sacrifice as few animals as possible. We
      say we are fine with a confidence interval of size $\pm\delta=5$, how
      should we choose $n$?
    % fixed duplicated "$n$" in the original sentence
    \item What $n$ should we choose if we want $\pm\delta=2$?
    \end{itemize}
    Extend your bootstrapping script from yesterday to check that the
    equation is correct.
  \end{task}
\end{frame}
\begin{frame}[fragile]
\frametitle{How to interrupt for/while loops}
\begin{itemize}
\item Sometimes you want to stop a for/while loop early.
\item The command for that is {\tt break}
\end{itemize}
{\bf Example}
\begin{lstlisting}
% silly way to find a random number larger than .8
for i = 1:2000
u = rand();
if u >= .8
disp('Found it!');
break
end
end
\end{lstlisting}
\end{frame}
\begin{frame}
\frametitle{winner's curse}
\begin{task}{Why it is important to estimate $n$ beforehand}
Use the thymus gland dataset to repeat the following procedure
\begin{enumerate}
\item Randomly select $n=10$ numbers from the whole dataset.
\item Perform a one-sample $t$-test ({\tt ttest}) to test against the
mean of $34.3$g.
\item If the p-value is smaller than $0.05$, stop the loop and
print the mean of the $10$ datapoints. Also print the mean of
the entire thymus gland dataset.
\item Why is it better to use a {\tt for} instead of a {\tt while} loop?
\item What can you observe? Why does that tell you that choosing
$n$ is important?
\end{enumerate}
\end{task}
\end{frame}
\begin{frame}[fragile]
\frametitle{solution}
\scriptsize
\begin{lstlisting}
load thymusglandweights.dat
n = 10;
x = thymusglandweights;
for i = 1:5000
idx = randi(length(x), n,1);
y = x(idx);
[h,p] = ttest(y, 34.3);
if h == 1
disp(['p-value: ', num2str(p)]);
disp(['mu: ', num2str(mean(y))]);
disp(['mu total: ', num2str(mean(x))]);
break
end
end
\end{lstlisting}
\end{frame}
\subsection{power}
\begin{frame}
  \frametitle{test nomenclature}
  \begin{center}
    \only<1>{\includegraphics[width=\linewidth]{figs/testframework00.pdf}}
    \only<2>{\includegraphics[width=\linewidth]{figs/testframework01.pdf}}
  \end{center}
  \small
  \begin{columns}
    % beamer's column environment takes a *vertical* placement option
    % (t, c, b, or T); the original [l]/[r] are not valid options.
    \begin{column}[t]{.5\linewidth}
      \textbf{You want:}% \textbf instead of obsolete {\bf ...}
      \begin{itemize}
      \item large power
      \item small type I \& II error probability ($\alpha$ and $\beta$)
      \end{itemize}
    \end{column}
    \begin{column}[t]{.5\linewidth}
    \end{column}
  \end{columns}
\end{frame}
\begin{frame}
\frametitle{power}
\begin{task}{estimating power with bootstrapping}
\begin{itemize}
\item Take the script from yesterday in which we simulated the
null distribution of the means.
\item Extend it such that it plots the bootstrapped distribution
of the means as well (use the same bins for both histograms by
using {\tt hist} for computing the histogram and {\tt bar} for
plotting).
\item Use logical indexing to find all means that correspond to
true positives (using the 95\% decision boundaries computed
yesterday). Estimate the power by computing the fraction of true
positive bootstrapped means.
\item What is the probability that you get a false negative?
\item If you have time, plot the histogram of true positives in a
different color.
\end{itemize}
\end{task}
\end{frame}
\begin{frame}
\frametitle{summary}
\begin{itemize}
\item Proper study design is important to avoid statistical problems
like the winner's curse.
\item You should choose a test with high power.
\item There are also equations to select $n$ for type I error {\em
and} power (see book by Zar).
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 4-5 -- curve fitting and maximum likelihood}
\begin{frame}
\frametitle{Overview}
\begin{itemize}
\item minimizing/maximizing a function numerically (optimization) is
ubiquitous in science (curve fitting, maximum likelihood, ...)
\item today we will look at the basic elements of optimization and
apply it to curve fitting
\item tomorrow, we will apply it to maximum likelihood
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{plotting surfaces}
\begin{lstlisting}
range = linspace(-1,1,20);
[X,Y] = meshgrid(range, range);
surf(X,Y, (X.^2 + Y.^2));
colormap('winter');
\end{lstlisting}
\end{frame}
\begin{frame}
  \frametitle{linear least squares}
  \begin{minipage}{1.0\linewidth}
    \begin{minipage}{0.3\linewidth}
      \includegraphics[width=\linewidth]{figs/leastsquares.png}
      % every underscore must be escaped in text mode; the original left
      % the one in "least_squares" unescaped, which is a compile error.
      \source{http://en.wikipedia.org/wiki/Linear\_least\_squares\_\%28mathematics\%29}
    \end{minipage}
    \begin{minipage}{0.7\linewidth}
      \begin{itemize}
      \item The most common curve fitting problem is \emph{linear least
          squares}.
      \item Its goal is to predict a set of output values $y_1, \dots,
        y_n$ from their corresponding input values $x_1,\dots,x_n$ with
        a line $f_{a,b}(x) = a x+b$.
      \item How is the line chosen?\pause
      \item[] By minimization of the mean squared error
        % \[ ... \] replaces the plain-TeX $$ ... $$ (l2tabu).
        \[ g(a,b) = \sum_{i=1}^n (y_i - f_{a,b}(x_i))^2 \]
      \end{itemize}
    \end{minipage}
  \end{minipage}
\end{frame}
\begin{frame}
\frametitle{error surface}
\begin{task}{plotting the error surface}
Write a function {\tt lserr} that takes 2-dimensional parameter
vector (slope and offset), an array of inputs {\tt x}, and an
array of corresponding outputs {\tt y}.
\end{task}
\end{frame}
\begin{frame}
\begin{center}
\Huge That's it.
\end{center}
\end{frame}
\end{document}