\documentclass{beamer}

\usepackage{xcolor}
\usepackage{listings}
\usepackage{pgf}
%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
%\usepackage{multimedia}
\usepackage[utf8]{inputenc}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{ulem}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\mode<presentation>
{
  \usetheme{Singapore}
  \setbeamercovered{opaque}
  \usecolortheme{tuebingen}
  \setbeamertemplate{navigation symbols}{}
  \usefonttheme{default}
  \useoutertheme{infolines}
%  \useoutertheme{miniframes}
}
\AtBeginSubsection[]
{
  \begin{frame}<beamer>
    \begin{center}
      \Huge \insertsectionhead
    \end{center}
    \tableofcontents[
      currentsubsection,
      hideothersubsections,
      sectionstyle=show/hide,
      subsectionstyle=show/shaded,
    ]
%    \frametitle{\insertsectionhead}
  \end{frame}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\setbeamertemplate{blocks}[rounded][shadow=true]

\title[]{Scientific Computing -- Statistics}
\author[Statistics]{Fabian Sinz\\Dept. Neuroethology,
  University T\"ubingen\\
  Bernstein Center T\"ubingen}

\institute[Scientific Computing]{}
\date{10/22/2014}
%\logo{\pgfuseimage{logo}}

\subject{Lectures}

%%%%%%%%%% configuration for code
\lstset{
  basicstyle=\ttfamily,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  captionpos=b,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\mycite}[1]{%
  \begin{flushright}
    \tiny \color{black!80} #1
  \end{flushright}%
}

\input{../latex/environments.tex}
\makeatother
\begin{document}

\begin{frame}
  \titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 3 -- study design: choosing n}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{choosing n for confidence intervals}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{frame}
  \frametitle{general theme}
  \begin{enumerate}
  \item make an educated guess about the true parameters
  \item state how accurate/powerful you want to be
  \item select $n$ based on that
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{estimating a single mean}
  \framesubtitle{standard error and $\alpha$}
  \begin{itemize}
  \item Assume you have an estimate $s$ of the standard deviation from
    the literature.
  \item The $95$\% confidence interval is given by
    \[\underbrace{|\tilde\mu - \mu_0|}_{=:\delta} \ge t_{97.5\%,
      \nu}\frac{s}{\sqrt{n}}\]\pause
  \item How should we choose $n$ to get a confidence interval of a
    particular size $\pm \delta$?\pause
  \item[] We should set $n$ to be
    \[n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2\]
  \end{itemize}
\end{frame}
\begin{frame}
  \frametitle{exercise}
  \begin{task}{choosing $n$}
    Example from last lecture: The literature value of thymus gland
    weight is $34.3$g. The estimate of the standard deviation from
    the literature is $s=10$g.

    The equation for $n$ is
    \[n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2\]
    \begin{itemize}
    \item Assume we want to sacrifice as few animals as possible. We
      say we are fine with a confidence interval of size $\pm\delta=5$; how
      should we choose $n$?
    \item What $n$ should we choose if we want $\pm\delta=2$?
    \end{itemize}
    Extend your bootstrapping script from yesterday to check that the
    equation is correct.
  \end{task}
\end{frame}
\begin{frame}[fragile]
  \frametitle{How to interrupt for/while loops}
  \begin{itemize}
  \item Sometimes you want to stop a for/while loop early.
  \item The command for that is {\tt break}.
  \end{itemize}
  {\bf Example}
\begin{lstlisting}
% silly way to find a random number larger than .8
for i = 1:2000
    u = rand();
    if u >= .8
        disp('Found it!');
        break
    end
end
\end{lstlisting}
\end{frame}
\begin{frame}
  \frametitle{winner's curse}
  \begin{task}{Why it is important to estimate $n$ beforehand}
    Use the thymus gland dataset to repeat the following procedure:
    \begin{enumerate}
    \item Randomly select $n=10$ numbers from the whole dataset.
    \item Perform a one-sample t-test ({\tt ttest}) to test against the
      mean of $34.3$g.
    \item If the p-value is smaller than $0.05$, stop the loop and
      print the mean of the $10$ datapoints. Also print the mean of
      the entire thymus gland dataset.
    \item Why is it better to use a {\tt for} instead of a {\tt while} loop?
    \item What can you observe? Why does that tell you that choosing
      $n$ is important?
    \end{enumerate}
  \end{task}
\end{frame}
\begin{frame}[fragile]
  \frametitle{solution}
  \scriptsize
\begin{lstlisting}
load thymusglandweights.dat

n = 10;
x = thymusglandweights;

for i = 1:5000
    idx = randi(length(x), n,1);
    y = x(idx);
    [h,p] = ttest(y, 34.3);

    if h == 1
        disp(['p-value: ', num2str(p)]);
        disp(['mu: ', num2str(mean(y))]);
        disp(['mu total: ', num2str(mean(x))]);
        break
    end
end
\end{lstlisting}
\end{frame}
\subsection{power}

\begin{frame}
  \frametitle{test nomenclature}
  \begin{center}
    \only<1>{\includegraphics[width=\linewidth]{figs/testframework00.pdf}}
    \only<2>{\includegraphics[width=\linewidth]{figs/testframework01.pdf}}
  \end{center}
  \small
  \begin{columns}
    \begin{column}[t]{.5\linewidth}
      {\bf You want:}
      \begin{itemize}
      \item large power
      \item small type I \& II error probability ($\alpha$ and $\beta$)
      \end{itemize}
    \end{column}
    \begin{column}[t]{.5\linewidth}
    \end{column}
  \end{columns}
\end{frame}
\begin{frame}
  \frametitle{power}
  \begin{task}{estimating power with bootstrapping}
    \begin{itemize}
    \item Take the script from yesterday in which we simulated the
      null distribution of the means.
    \item Extend it such that it plots the bootstrapped distribution
      of the means as well (use the same bins for both histograms by
      using {\tt hist} for computing the histogram and {\tt bar} for
      plotting).
    \item Use logical indexing to find all means that correspond to
      true positives (using the 95\% decision boundaries computed
      yesterday). Estimate the power by computing the fraction of true
      positive bootstrapped means.
    \item What is the probability that you get a false negative?
    \item If you have time, plot the histogram of true positives in a
      different color.
    \end{itemize}
  \end{task}
\end{frame}
\begin{frame}
  \frametitle{summary}
  \begin{itemize}
  \item Proper study design is important to avoid statistical problems
    like the winner's curse.
  \item You should choose a test with high power.
  \item There are also equations to select $n$ for type I error {\em
      and} power (see the book by Zar).
  \end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 4-5 -- curve fitting and maximum likelihood}

\begin{frame}
  \frametitle{Overview}
  \begin{itemize}
  \item minimizing/maximizing a function numerically (optimization) is
    ubiquitous in science (curve fitting, maximum likelihood, ...)
  \item today we will look at the basic elements of optimization and
    apply it to curve fitting
  \item tomorrow, we will apply it to maximum likelihood
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{plotting surfaces}
\begin{lstlisting}
range = linspace(-1,1,20);
[X,Y] = meshgrid(range, range);

surf(X,Y, (X.^2 + Y.^2));
colormap('winter');
\end{lstlisting}
\end{frame}
\begin{frame}
  \frametitle{linear least squares}
  \begin{minipage}{1.0\linewidth}
    \begin{minipage}{0.3\linewidth}
      \includegraphics[width=\linewidth]{figs/leastsquares.png}
      \source{http://en.wikipedia.org/wiki/Linear\_least\_squares\_\%28mathematics\%29}
    \end{minipage}
    \begin{minipage}{0.7\linewidth}
      \begin{itemize}
      \item The most common curve fitting problem is {\em linear least
          squares}.
      \item Its goal is to predict a set of output values $y_1, \dots,
        y_n$ from their corresponding input values $x_1,\dots,x_n$ with
        a line $f_{a,b}(x) = a x+b$.
      \item How is the line chosen?\pause
      \item[] By minimization of the mean squared error
        \[g(a,b) = \sum_{i=1}^n (y_i - f_{a,b}(x_i))^2\]
      \end{itemize}
    \end{minipage}
  \end{minipage}
\end{frame}
\begin{frame}
  \frametitle{error surface}
  \begin{task}{plotting the error surface}
    Write a function {\tt lserr} that takes a 2-dimensional parameter
    vector (slope and offset), an array of inputs {\tt x}, and an
    array of corresponding outputs {\tt y}.
  \end{task}
\end{frame}

\begin{frame}
  \begin{center}
    \Huge That's it.
  \end{center}
\end{frame}

\end{document}