% NOTE(review): the following lines are web-page residue from the repository
% viewer (archive banner, file path, size metadata) — they are not part of
% the TeX source and are commented out so the file compiles.
% This repository has been archived on 2021-05-17. You can view files and clone it, but cannot push or open issues or pull requests.
% scientificComputing/statistics-fabian/lecture_statistics03.tex
% 330 lines, 8.8 KiB, TeX
\documentclass{beamer}
\usepackage{xcolor}
\usepackage{listings}
\usepackage{pgf}
%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
%\usepackage{multimedia}
% Input encoding: the source is plain ASCII (umlauts are written as \"u),
% so the obsolete latin1 option is replaced by utf8 (the modern default).
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{ulem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mode<presentation>
{
  \usetheme{Singapore}
  \setbeamercovered{opaque}
  \usecolortheme{tuebingen}% project-local color theme
  \setbeamertemplate{navigation symbols}{}% hide the navigation bar
  \usefonttheme{default}
  \useoutertheme{infolines}
  % \useoutertheme{miniframes}
}
% Show a section title plus a mini table of contents at the start of
% every subsection.
\AtBeginSubsection[]
{
  \begin{frame}<beamer>
    \begin{center}
      \Huge \insertsectionhead
    \end{center}
    \tableofcontents[
      currentsubsection,
      hideothersubsections,
      sectionstyle=show/hide,
      subsectionstyle=show/shaded,
    ]
    % \frametitle{\insertsectionhead}
  \end{frame}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\setbeamertemplate{blocks}[rounded][shadow=true]
\title[]{Scientific Computing -- Statistics}
\author[Statistics]{Fabian Sinz\\Dept. Neuroethology,
University T\"ubingen\\
Bernstein Center T\"ubingen}
\institute[Scientific Computing]{}
\date{10/22/2014}
%\logo{\pgfuseimage{logo}}
\subject{Lectures}
%%%%%%%%%% configuration for code listings (MATLAB)
\lstset{
  basicstyle=\ttfamily,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  captionpos=b,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Right-aligned, tiny citation line.  The trailing % signs keep the macro
% from inserting spurious spaces into the surrounding text.
\newcommand{\mycite}[1]{%
  \begin{flushright}
    \tiny \color{black!80} #1%
  \end{flushright}%
}
\input{../latex/environments.tex}% project-local environments (e.g. task)
\makeatother
\begin{document}
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 3 -- study design: choosing n}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{choosing n for confidence intervals}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{general theme}
\begin{enumerate}
\item make an educated guess about the true parameters
\item state how accurate/powerful you want to be
\item select $n$ based on that
\end{enumerate}
\end{frame}
\begin{frame}
  \frametitle{estimating a single mean}
  \framesubtitle{standard error and $\alpha$}
  \begin{itemize}
  \item Assume you have an estimate $s$ of the standard deviation from
    the literature.
  \item The $95$\% confidence interval is given by
    % \[ ... \] replaces the plain-TeX $$ ... $$, which has wrong
    % vertical spacing in LaTeX (l2tabu).
    \[\underbrace{|\tilde\mu - \mu_0|}_{=:\delta} \ge t_{97.5\%,
    \nu}\frac{s}{\sqrt{n}}\]\pause
  \item How should we choose $n$ to get a confidence interval of a
    particular size $\pm \delta$?\pause
  \item[] We should set $n$ to be
    \[ n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 \]
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{exercise}
  \begin{task}{choosing $n$}
    Example from last lecture: Literature value of thymus gland
    weights is $34.3$g. The estimate of the standard deviation from
    the literature is $s=10$g.
    The equation for $n$ is
    % \[ ... \] replaces the plain-TeX $$ ... $$ (l2tabu).
    \[ n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 \]
    \begin{itemize}
    \item Assume we want to sacrifice as few animals as possible. We
      say we are fine with a confidence interval of size $\pm\delta=5$, how
      should we choose $n$?
    % fixed duplicated "$n$" in the original sentence
    \item What $n$ should we choose if we want $\pm\delta=2$?
    \end{itemize}
    Extend your bootstrapping script from yesterday to check that the
    equation is correct.
  \end{task}
\end{frame}
\begin{frame}[fragile]
\frametitle{How to interrupt for/while loops}
\begin{itemize}
\item Sometimes you want to stop a for/while loop early.
\item The command for that is {\tt break}
\end{itemize}
{\bf Example}
\begin{lstlisting}
% silly way to find a random number larger than .8
for i = 1:2000
u = rand();
if u >= .8
disp('Found it!');
break
end
end
\end{lstlisting}
\end{frame}
\begin{frame}
\frametitle{winner's curse}
\begin{task}{Why it is important to estimate $n$ beforehand}
Use the thymus gland dataset to repeat the following procedure
\begin{enumerate}
\item Randomly select $n=10$ numbers from the whole dataset.
\item Perform a one-sample $t$-test ({\tt ttest}) to test against the
mean of $34.3$g.
\item If the p-value is smaller than $0.05$, stop the loop and
print the mean of the $10$ datapoints. Also print the mean of
the entire thymus gland dataset.
\item Why is it better to use a {\tt for} instead of a {\tt while} loop?
\item What can you observe? Why does that tell you that choosing
$n$ is important?
\end{enumerate}
\end{task}
\end{frame}
\begin{frame}[fragile]
\frametitle{solution}
\scriptsize
\begin{lstlisting}
load thymusglandweights.dat
n = 10;
x = thymusglandweights;
for i = 1:5000
idx = randi(length(x), n,1);
y = x(idx);
[h,p] = ttest(y, 34.3);
if h == 1
disp(['p-value: ', num2str(p)]);
disp(['mu: ', num2str(mean(y))]);
disp(['mu total: ', num2str(mean(x))]);
break
end
end
\end{lstlisting}
\end{frame}
\subsection{power}
\begin{frame}
  \frametitle{test nomenclature}
  \begin{center}
    \only<1>{\includegraphics[width=\linewidth]{figs/testframework00.pdf}}
    \only<2>{\includegraphics[width=\linewidth]{figs/testframework01.pdf}}
  \end{center}
  \small
  \begin{columns}
    % beamer's column environment takes a *vertical* placement option
    % (t, c, b, or T); the original [l]/[r] are not valid options.
    \begin{column}[t]{.5\linewidth}
      \textbf{You want:}% \textbf instead of obsolete {\bf ...}
      \begin{itemize}
      \item large power
      \item small type I \& II error probability ($\alpha$ and $\beta$)
      \end{itemize}
    \end{column}
    \begin{column}[t]{.5\linewidth}
    \end{column}
  \end{columns}
\end{frame}
\begin{frame}
\frametitle{power}
\begin{task}{estimating power with bootstrapping}
\begin{itemize}
\item Take the script from yesterday in which we simulated the
null distribution of the means.
\item Extend it such that it plots the bootstrapped distribution
of the means as well (use the same bins for both histograms by
using {\tt hist} for computing the histogram and {\tt bar} for
plotting).
\item Use logical indexing to find all means that correspond to
true positives (using the 95\% decision boundaries computed
yesterday). Estimate the power by computing the fraction of true
positive bootstrapped means.
\item What is the probability that you get a false negative?
\item If you have time, plot the histogram of true positives in a
different color.
\end{itemize}
\end{task}
\end{frame}
\begin{frame}
\frametitle{summary}
\begin{itemize}
\item Proper study design is important to avoid statistical problems
like the winner's curse.
\item You should choose a test with high power.
\item There are also equations to select $n$ for type I error {\em
and} power (see book by Zar).
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 4-5 -- curve fitting and maximum likelihood}
\begin{frame}
\frametitle{Overview}
\begin{itemize}
\item minimizing/maximizing a function numerically (optimization) is
ubiquitous in science (curve fitting, maximum likelihood, ...)
\item today we will look at the basic elements of optimization and
apply it to curve fitting
\item tomorrow, we will apply it to maximum likelihood
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{plotting surfaces}
\begin{lstlisting}
range = linspace(-1,1,20);
[X,Y] = meshgrid(range, range);
surf(X,Y, (X.^2 + Y.^2));
colormap('winter');
\end{lstlisting}
\end{frame}
\begin{frame}
  \frametitle{linear least squares}
  \begin{minipage}{1.0\linewidth}
    \begin{minipage}{0.3\linewidth}
      \includegraphics[width=\linewidth]{figs/leastsquares.png}
      % every underscore must be escaped in text mode; the original left
      % the one in "least_squares" unescaped, which is a compile error.
      \source{http://en.wikipedia.org/wiki/Linear\_least\_squares\_\%28mathematics\%29}
    \end{minipage}
    \begin{minipage}{0.7\linewidth}
      \begin{itemize}
      \item The most common curve fitting problem is \emph{linear least
          squares}.
      \item Its goal is to predict a set of output values $y_1, \dots,
        y_n$ from their corresponding input values $x_1,\dots,x_n$ with
        a line $f_{a,b}(x) = a x+b$.
      \item How is the line chosen?\pause
      \item[] By minimization of the mean squared error
        % \[ ... \] replaces the plain-TeX $$ ... $$ (l2tabu).
        \[ g(a,b) = \sum_{i=1}^n (y_i - f_{a,b}(x_i))^2 \]
      \end{itemize}
    \end{minipage}
  \end{minipage}
\end{frame}
\begin{frame}
\frametitle{error surface}
\begin{task}{plotting the error surface}
Write a function {\tt lserr} that takes 2-dimensional parameter
vector (slope and offset), an array of inputs {\tt x}, and an
array of corresponding outputs {\tt y}.
\end{task}
\end{frame}
\begin{frame}
\begin{center}
\Huge That's it.
\end{center}
\end{frame}
\end{document}