% Beamer slide deck "Scientific Computing -- Statistics":
% preamble, title frame, and the opening section headings.
\documentclass{beamer}

% ---- packages ----
\usepackage{xcolor}
\usepackage{listings}
\usepackage{pgf}
%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
%\usepackage{multimedia}
\usepackage[latin1]{inputenc}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{ulem}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theme configuration; \mode needs its <mode specification>, here the
% presentation modes.
\mode<presentation>
{
  \usetheme{Singapore}
  \setbeamercovered{opaque}
  \usecolortheme{tuebingen}
  \setbeamertemplate{navigation symbols}{}
  \usefonttheme{default}
  \useoutertheme{infolines}
  % \useoutertheme{miniframes}
}

% At the start of every subsection, insert a frame that shows the current
% section name in \Huge type followed by a table of contents.
\AtBeginSubsection[]
{
  \begin{frame}
    \begin{center}
      \Huge \insertsectionhead
    \end{center}
    \tableofcontents[
      currentsubsection,
      hideothersubsections,
      sectionstyle=show/hide,
      subsectionstyle=show/shaded,
    ]
    % \frametitle{\insertsectionhead}
  \end{frame}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
\setbeamertemplate{blocks}[rounded][shadow=true]

% ---- title page data ----
\title[]{Scientific Computing -- Statistics}
\author[Statistics]{Fabian Sinz\\Dept. Neuroethology, University T\"ubingen\\
  Bernstein Center T\"ubingen}
\institute[Scientific Computing]{}
\date{10/22/2014}
%\logo{\pgfuseimage{logo}}

\subject{Lectures}

%%%%%%%%%% configuration for code
% Defaults for all lstlisting environments (Matlab highlighting, line
% numbers on the left, light blue background, single frame).
\lstset{
  basicstyle=\ttfamily,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  captionpos=b,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \mycite{<text>}: typesets <text> right-aligned, in \tiny size and
% 80% black.
\newcommand{\mycite}[1]{
\begin{flushright}
\tiny \color{black!80} #1
\end{flushright}
}

% Project-local definitions (presumably the task environment and \source
% used on later slides -- confirm against ../latex/environments.tex).
\input{../latex/environments.tex}
\makeatother

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 3 -- study design: choosing n}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{choosing n for confidence intervals}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{general theme}
  \begin{enumerate}
  \item make an educated guess about the true parameters
  \item state how accurate/powerful you want to be
  \item select $n$ based on that
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{estimating a single mean}
  \framesubtitle{standard error and $\alpha$}
  \begin{itemize}
  \item Assume you have an estimate $s$ of the standard deviation from
    the literature.
  \item The $95$\% confidence interval is given by
    $$\underbrace{|\tilde\mu - \mu_0|}_{=:\delta} \ge t_{97.5\%, \nu}\frac{s}{\sqrt{n}}$$\pause
  \item How should we choose $n$ to get a confidence interval of a
    particular size $\pm \delta$?\pause
  \item[] We should set $n$ to be
    $$n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 $$
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{exercise}
  \begin{task}{choosing $n$}
    Example from last lecture: The literature value of thymus gland
    weights is $34.3$g. The estimate of the standard deviation from the
    literature is $s=10$g. The equation for $n$ is
    $$n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 $$
    \begin{itemize}
    \item Assume we want to sacrifice as few animals as possible. If we
      are fine with a confidence interval of size $\pm\delta=5$, how
      should we choose $n$?
    \item What $n$ should we choose if we want $\pm\delta=2$?
    \end{itemize}
    Extend your bootstrapping script from yesterday to check that the
    equation is correct.
  \end{task}
\end{frame}

\begin{frame}[fragile]
  \frametitle{How to interrupt for/while loops}
  \begin{itemize}
  \item Sometimes you want to stop a for/while loop early.
\item The command for that is {\tt break}
  \end{itemize}
  {\bf Example}
\begin{lstlisting}
% silly way to find a random number larger than .8
for i = 1:2000
  u = rand();
  if u >= .8
    disp('Found it!');
    break
  end
end
\end{lstlisting}
\end{frame}

\begin{frame}
  \frametitle{winner's curse}
  \begin{task}{Why it is important to estimate $n$ beforehand}
    Use the thymus gland dataset to repeat the following procedure:
    \begin{enumerate}
    \item Randomly select $n=10$ numbers from the whole dataset.
    \item Perform a one-sample $t$-test ({\tt ttest}) to test against the
      mean of $34.3$g.
    \item If the p-value is smaller than $0.05$, stop the loop and print
      the mean of the $10$ datapoints. Also print the mean of the entire
      thymus gland dataset.
    \item Why is it better to use a {\tt for} instead of a {\tt while}
      loop?
    \item What can you observe? Why does that tell you that choosing $n$
      is important?
    \end{enumerate}
  \end{task}
\end{frame}

\begin{frame}[fragile]
  \frametitle{solution}
  \scriptsize
\begin{lstlisting}
load thymusglandweights.dat
n = 10;
x = thymusglandweights;
for i = 1:5000
  idx = randi(length(x), n,1);
  y = x(idx);
  [h,p] = ttest(y, 34.3);
  if h == 1
    disp(['p-value: ', num2str(p)]);
    disp(['mu: ', num2str(mean(y))]);
    disp(['mu total: ', num2str(mean(x))]);
    break
  end
end
\end{lstlisting}
\end{frame}

\subsection{power}

\begin{frame}
  \frametitle{test nomenclature}
  \begin{center}
    \only<1>{\includegraphics[width=\linewidth]{figs/testframework00.pdf}}
    \only<2>{\includegraphics[width=\linewidth]{figs/testframework01.pdf}}
  \end{center}
  \small
  \begin{columns}
    \begin{column}[l]{.5\linewidth}
      {\bf You want:}
      \begin{itemize}
      \item large power
      \item small type I \& II error probability ($\alpha$ and $\beta$)
      \end{itemize}
    \end{column}
    \begin{column}[r]{.5\linewidth}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}
  \frametitle{power}
  \begin{task}{estimating power with bootstrapping}
    \begin{itemize}
    \item Take the script from yesterday in which we simulated the null
      distribution of the means.
\item Extend it such that it plots the bootstrapped distribution of the
      means as well (use the same bins for both histograms by using
      {\tt hist} for computing the histogram and {\tt bar} for plotting).
    \item Use logical indexing to find all means that correspond to true
      positives (using the 95\% decision boundaries computed yesterday).
      Estimate the power by computing the fraction of true positive
      bootstrapped means.
    \item What is the probability that you get a false negative?
    \item If you have time, plot the histogram of true positives in a
      different color.
    \end{itemize}
  \end{task}
\end{frame}

\begin{frame}
  \frametitle{summary}
  \begin{itemize}
  \item Proper study design is important to avoid statistical problems
    like the winner's curse.
  \item You should choose a test with high power.
  \item There are also equations to select $n$ for type I error {\em and}
    power (see the book by Zar).
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 4--5 -- curve fitting and maximum likelihood}

\begin{frame}
  \frametitle{Overview}
  \begin{itemize}
  \item minimizing/maximizing a function numerically (optimization) is
    ubiquitous in science (curve fitting, maximum likelihood, \dots)
\item today we will look at the basic elements of optimization and apply
    it to curve fitting
  \item tomorrow, we will apply it to maximum likelihood
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{plotting surfaces}
\begin{lstlisting}
range = linspace(-1,1,20);
[X,Y] = meshgrid(range, range);
surf(X,Y, (X.^2 + Y.^2));
colormap('winter');
\end{lstlisting}
\end{frame}

\begin{frame}
  \frametitle{linear least squares}
  \begin{minipage}{1.0\linewidth}
    \begin{minipage}{0.3\linewidth}
      \includegraphics[width=\linewidth]{figs/leastsquares.png}
      \source{http://en.wikipedia.org/wiki/Linear\_least\_squares\_\%28mathematics\%29}
    \end{minipage}
    \begin{minipage}{0.7\linewidth}
      \begin{itemize}
      \item The most common curve fitting problem is {\em linear least
        squares}.
      \item Its goal is to predict a set of output values $y_1, ..., y_n$
        from their corresponding input values $x_1,...,x_n$ with a line
        $f_{a,b}(x) = a x+b$.
      \item How is the line chosen?\pause
      \item[] By minimizing the sum of squared errors
        $$g(a,b) = \sum_{i=1}^n (y_i - f_{a,b}(x_i))^2$$
      \end{itemize}
    \end{minipage}
  \end{minipage}
\end{frame}

\begin{frame}
  \frametitle{error surface}
  \begin{task}{plotting the error surface}
    Write a function {\tt lserr} that takes a 2-dimensional parameter
    vector (slope and offset), an array of inputs {\tt x}, and an array of
    corresponding outputs {\tt y}.
  \end{task}
\end{frame}

\begin{frame}
  \begin{center}
    \Huge That's it.
  \end{center}
\end{frame}

\end{document}