diff --git a/statistics/assignments/day3.tex b/statistics/assignments/day3.tex
new file mode 100644
index 0000000..5f238bb
--- /dev/null
+++ b/statistics/assignments/day3.tex
@@ -0,0 +1,65 @@
+\documentclass[addpoints,10pt]{exam}
+\usepackage{url}
+\usepackage{color}
+\usepackage{hyperref}
+
+\pagestyle{headandfoot}
+\runningheadrule
+\firstpageheadrule
+
+\firstpageheader{Scientific Computing}{afternoon assignment day 03}{10/22/2014}
+%\runningheader{Homework 01}{Page \thepage\ of \numpages}{23. October 2014}
+\firstpagefooter{}{}{}
+\runningfooter{}{}{}
+\pointsinmargin
+\bracketedpoints
+
+%\printanswers
+\shadedsolutions
+
+
+\begin{document}
+%%%%%%%%%%%%%%%%%%%%% Submission instructions %%%%%%%%%%%%%%%%%%%%%%%%%
+\sffamily
+%%%%%%%%%%%%%% Questions %%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{questions}
+  \question When the p-value is small, we reject the null
+  hypothesis. For example, if you want to test whether two means are
+  not equal, the null hypothesis is ``the means are equal''. If, for
+  example, $p\le 0.05$, we take this as sufficient evidence that the
+  null hypothesis is not true. Therefore, we conclude that the means
+  are not equal (which is what you want to show).
+
+  In this exercise we will look at what kind of p-values we expect if
+  the null hypothesis is true. In our example, this would be the case
+  if the true means of the two datasets are actually equal.
+  \begin{parts}
+    \part Think about how you expect the p-values to behave in that
+    situation.
+    \part Simulate the situation in which the means are equal by
+    repeating the following at least $1000$ times (a starter sketch is
+    given at the end of this question):
+    \begin{enumerate}
+    \item Generate two arrays {\tt x} and {\tt y} with $10$ normally
+      (Gaussian) distributed random numbers using {\tt randn}. By
+      construction, the true means behind the random numbers are zero.
+    \item Perform a two-sample t-test ({\tt ttest2}) on {\tt x} and
+      {\tt y}. Store the p-value.
+    \end{enumerate}
+    \part Plot a histogram of the $1000$ p-values. What do you think
+    is the distribution of the p-values (i.e. if you repeated this
+    experiment many more times, what would the histogram look like)?
+    \part Given what you find, think about whether the following
+    strategy is statistically valid: You collect $10$ data points from
+    each group and perform a test. If the test is not significant, you
+    collect $10$ more and repeat the test. If the test tells you that
+    there is a significant difference, you stop. Otherwise you repeat
+    the procedure until the test is significant.
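+
+    As a starting point for the simulation in the second part, a
+    minimal sketch (this assumes the statistics toolbox function
+    {\tt ttest2}; the array sizes and the number of repetitions are
+    just the ones suggested above):
+\begin{verbatim}
+m = 1000;                  % number of repetitions
+pvals = zeros(m, 1);
+for i = 1:m
+    x = randn(10, 1);      % true mean is zero
+    y = randn(10, 1);      % true mean is zero as well
+    [h, pvals(i)] = ttest2(x, y);
+end
+hist(pvals, 20);           % histogram of the p-values
+\end{verbatim}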
+  \end{parts}
+\end{questions}
+
+
+
+
+
+\end{document}
diff --git a/statistics/data/membraneVoltage.mat b/statistics/data/membraneVoltage.mat
new file mode 100644
index 0000000..26d9f44
Binary files /dev/null and b/statistics/data/membraneVoltage.mat differ
diff --git a/statistics/environments.tex b/statistics/environments.tex
new file mode 120000
index 0000000..4741d2f
--- /dev/null
+++ b/statistics/environments.tex
@@ -0,0 +1 @@
+../latex/environments.tex
\ No newline at end of file
diff --git a/statistics/figs/charging.png b/statistics/figs/charging.png
new file mode 100644
index 0000000..2998668
Binary files /dev/null and b/statistics/figs/charging.png differ
diff --git a/statistics/figs/experimentalDesign00.pdf b/statistics/figs/experimentalDesign00.pdf
new file mode 100644
index 0000000..e05f151
Binary files /dev/null and b/statistics/figs/experimentalDesign00.pdf differ
diff --git a/statistics/figs/experimentalDesign01.pdf b/statistics/figs/experimentalDesign01.pdf
new file mode 100644
index 0000000..2b26c3d
Binary files /dev/null and b/statistics/figs/experimentalDesign01.pdf differ
diff --git a/statistics/figs/leastsquares.png b/statistics/figs/leastsquares.png
new file mode 100644
index 0000000..e241a55
Binary files /dev/null and b/statistics/figs/leastsquares.png differ
diff --git a/statistics/figs/testframework00.pdf b/statistics/figs/testframework00.pdf
new file mode 100644
index 0000000..a44f348
Binary files /dev/null and b/statistics/figs/testframework00.pdf differ
diff --git a/statistics/figs/testframework01.pdf b/statistics/figs/testframework01.pdf
new file mode 100644
index 0000000..65d61ed
Binary files /dev/null and b/statistics/figs/testframework01.pdf differ
diff --git a/statistics/lecture_statistics02.tex b/statistics/lecture_statistics02.tex
index 1ea8246..42586d1 100644
--- a/statistics/lecture_statistics02.tex
+++ b/statistics/lecture_statistics02.tex
@@ -47,7 +47,7 @@
 Bernstein Center T\"ubingen}
 
 \institute[Scientific Computing]{}
- \date{10/20/2014}
+ \date{10/21/2014}
 %\logo{\pgfuseimage{logo}}
 
 \subject{Lectures}
diff --git a/statistics/lecture_statistics03.tex b/statistics/lecture_statistics03.tex
new file mode 100644
index 0000000..cc14c0e
--- /dev/null
+++ b/statistics/lecture_statistics03.tex
@@ -0,0 +1,329 @@
+\documentclass{beamer}
+\usepackage{xcolor}
+\usepackage{listings}
+\usepackage{pgf}
+%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
+%\usepackage{multimedia}
+\usepackage[latin1]{inputenc}
+\usepackage{amsmath}
+\usepackage{bm}
+\usepackage[T1]{fontenc}
+\usepackage{hyperref}
+\usepackage{ulem}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>
+{
+  \usetheme{Singapore}
+  \setbeamercovered{opaque}
+  \usecolortheme{tuebingen}
+  \setbeamertemplate{navigation symbols}{}
+  \usefonttheme{default}
+  \useoutertheme{infolines}
+  % \useoutertheme{miniframes}
+}
+
+\AtBeginSubsection[]
+{
+  \begin{frame}
+    \begin{center}
+      \Huge \insertsectionhead
+    \end{center}
+    \tableofcontents[
+    currentsubsection,
+    hideothersubsections,
+    sectionstyle=show/hide,
+    subsectionstyle=show/shaded,
+    ]
+    % \frametitle{\insertsectionhead}
+  \end{frame}
+}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
+
+\setbeamertemplate{blocks}[rounded][shadow=true]
+
+\title[]{Scientific Computing -- Statistics}
+\author[Statistics]{Fabian Sinz\\Dept. Neuroethology,
+  University T\"ubingen\\
+Bernstein Center T\"ubingen}
+
+\institute[Scientific Computing]{}
+ \date{10/22/2014}
+%\logo{\pgfuseimage{logo}}
+
+\subject{Lectures}
+
+%%%%%%%%%% configuration for code
+\lstset{
+  basicstyle=\ttfamily,
+  numbers=left,
+  showstringspaces=false,
+  language=Matlab,
+  commentstyle=\itshape\color{darkgray},
+  keywordstyle=\color{blue},
+  stringstyle=\color{green},
+  backgroundcolor=\color{blue!10},
+  breaklines=true,
+  breakautoindent=true,
+  columns=flexible,
+  frame=single,
+  captionpos=b,
+  xleftmargin=1em,
+  xrightmargin=1em,
+  aboveskip=10pt
+  }
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\newcommand{\mycite}[1]{
+\begin{flushright}
+\tiny \color{black!80} #1
+\end{flushright}
+}
+
+\input{../latex/environments.tex}
+\makeatother
+
+\begin{document}
+
+\begin{frame}
+  \titlepage
+\end{frame}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Day 3 -- study design: choosing n}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{choosing n for confidence intervals}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\begin{frame}
+  \frametitle{general theme}
+  \begin{enumerate}
+  \item make an educated guess about the true parameters
+  \item state how accurate/powerful you want to be
+  \item select $n$ based on that
+  \end{enumerate}
+\end{frame}
+
+\begin{frame}
+  \frametitle{estimating a single mean}
+  \framesubtitle{standard error and $\alpha$}
+  \begin{itemize}
+  \item Assume you have an estimate $s$ of the standard deviation from
+    the literature.
+  \item A value $\mu_0$ lies outside the $95$\% confidence interval if
+    $$\underbrace{|\tilde\mu - \mu_0|}_{=:\delta} \ge t_{97.5\%,
+    \nu}\frac{s}{\sqrt{n}}$$\pause
+  \item How should we choose $n$ to get a confidence interval of a
+    particular size $\pm \delta$?\pause
+  \item[] We should set $n$ such that
+    $$n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 $$
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{exercise}
+  \begin{task}{choosing $n$}
+    Example from last lecture: The literature value of thymus gland
+    weights is $34.3$g. The estimate of the standard deviation from
+    the literature is $s=10$g.
+
+    The equation for $n$ is
+    $$n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 $$
+    \begin{itemize}
+    \item Assume we want to sacrifice as few animals as possible. We
+      say we are fine with a confidence interval of size
+      $\pm\delta=5$; how should we choose $n$?
+    \item What $n$ should we choose if we want $\pm\delta=2$?
+    \end{itemize}
+    Extend your bootstrapping script from yesterday to check that the
+    equation is correct. A numerical sketch for solving the equation
+    is shown on the next slide.
+  \end{task}
+\end{frame}
+
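+\begin{frame}[fragile]
+  \frametitle{choosing $n$ numerically}
+  A minimal sketch of how one could solve the inequality for $n$
+  (assuming the statistics toolbox function {\tt tinv}; since the
+  degrees of freedom $\nu = n - 1$ depend on $n$, we simply increase
+  $n$ until the interval is small enough):
+\begin{lstlisting}
+s = 10;     % standard deviation from the literature
+delta = 5;  % desired size of the confidence interval
+
+n = 2;      % smallest n for which a t-quantile exists
+while tinv(0.975, n - 1)*s/sqrt(n) > delta
+    n = n + 1;
+end
+disp(['choose n >= ', num2str(n)]);
+\end{lstlisting}
+\end{frame}
+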
+\begin{frame}[fragile]
+  \frametitle{How to interrupt for/while loops}
+  \begin{itemize}
+  \item Sometimes you want to stop a for/while loop early.
+  \item The command for that is {\tt break}.
+  \end{itemize}
+{\bf Example}
+\begin{lstlisting}
+% silly way to find a random number larger than .8
+for i = 1:2000
+    u = rand();
+    if u >= .8
+        disp('Found it!');
+        break
+    end
+end
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}
+  \frametitle{winner's curse}
+  \begin{task}{Why it is important to estimate $n$ beforehand}
+    Use the thymus gland dataset to repeat the following procedure
+    \begin{enumerate}
+    \item Randomly select $n=10$ numbers from the whole dataset.
+    \item Perform a one-sample t-test ({\tt ttest}) to test against the
+      mean of $34.3$g.
+    \item If the p-value is smaller than $0.05$, stop the loop and
+      print the mean of the $10$ datapoints. Also print the mean of
+      the entire thymus gland dataset.
+    \item Why is it better to use a {\tt for} instead of a {\tt while} loop?
+    \item What can you observe? Why does that tell you that choosing
+      $n$ is important?
+    \end{enumerate}
+  \end{task}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{solution}
+\scriptsize
+\begin{lstlisting}
+load thymusglandweights.dat
+
+n = 10;
+
+x = thymusglandweights;
+
+for i = 1:5000
+    idx = randi(length(x), n, 1);
+    y = x(idx);
+    [h, p] = ttest(y, 34.3);
+
+    if h == 1
+        disp(['p-value: ', num2str(p)]);
+        disp(['mu: ', num2str(mean(y))]);
+        disp(['mu total: ', num2str(mean(x))]);
+        break
+    end
+end
+\end{lstlisting}
+\end{frame}
+
+\subsection{power}
+
+\begin{frame}
+  \frametitle{test nomenclature}
+  \begin{center}
+    \only<1>{\includegraphics[width=\linewidth]{figs/testframework00.pdf}}
+    \only<2>{\includegraphics[width=\linewidth]{figs/testframework01.pdf}}
+  \end{center}
+\small
+\begin{columns}
+  \begin{column}[t]{.5\linewidth}
+{\bf You want:}
+\begin{itemize}
+\item large power
+\item small type I \& II error probabilities ($\alpha$ and $\beta$)
+\end{itemize}
+  \end{column}
+  \begin{column}[t]{.5\linewidth}
+  \end{column}
+\end{columns}
+\end{frame}
+
+\begin{frame}
+  \frametitle{power}
+  \begin{task}{estimating power with bootstrapping}
+    \begin{itemize}
+    \item Take the script from yesterday in which we simulated the
+      null distribution of the means.
+    \item Extend it such that it plots the bootstrapped distribution
+      of the means as well (use the same bins for both histograms by
+      using {\tt hist} for computing the histograms and {\tt bar} for
+      plotting).
+    \item Use logical indexing to find all means that correspond to
+      true positives (using the 95\% decision boundaries computed
+      yesterday). Estimate the power by computing the fraction of true
+      positive bootstrapped means.
+    \item What is the probability that you get a false negative?
+    \item If you have time, plot the histogram of true positives in a
+      different color.
+    \end{itemize}
+  \end{task}
+\end{frame}
+
+\begin{frame}
+  \frametitle{summary}
+  \begin{itemize}
+  \item Proper study design is important to avoid statistical problems
+    like the winner's curse.
+  \item You should choose a test with high power.
+  \item There are also equations to select $n$ for type I error {\em
+      and} power (see the book by Zar).
+  \end{itemize}
+\end{frame}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Day 4-5 -- curve fitting and maximum likelihood}
+\begin{frame}
+  \frametitle{Overview}
+  \begin{itemize}
+  \item minimizing/maximizing a function numerically (optimization) is
+    ubiquitous in science (curve fitting, maximum likelihood, ...)
+  \item today we will look at the basic elements of optimization and
+    apply them to curve fitting
+  \item tomorrow, we will apply them to maximum likelihood
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{plotting surfaces}
+\begin{lstlisting}
+range = linspace(-1,1,20);
+[X,Y] = meshgrid(range, range);
+
+surf(X, Y, (X.^2 + Y.^2));
+colormap('winter');
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}
+  \frametitle{linear least squares}
+  \begin{minipage}{1.0\linewidth}
+    \begin{minipage}{0.3\linewidth}
+      \includegraphics[width=\linewidth]{figs/leastsquares.png}
+      \source{http://en.wikipedia.org/wiki/Linear\_least\_squares\_\%28mathematics\%29}
+    \end{minipage}
+    \begin{minipage}{0.7\linewidth}
+      \begin{itemize}
+      \item The most common curve fitting problem is {\em linear least
+          squares}.
+      \item Its goal is to predict a set of output values $y_1, ...,
+        y_n$ from their corresponding input values $x_1,...,x_n$ with
+        a line $f_{a,b}(x) = a x+b$.
+      \item How is the line chosen?\pause
+      \item[] By minimization of the mean squared error
+        $$g(a,b) = \sum_{i=1}^n (y_i - f_{a,b}(x_i))^2$$
+      \end{itemize}
+    \end{minipage}
+  \end{minipage}
+\end{frame}
+
+\begin{frame}
+  \frametitle{error surface}
+  \begin{task}{plotting the error surface}
+    Write a function {\tt lserr} that takes a 2-dimensional parameter
+    vector (slope and offset), an array of inputs {\tt x}, and an
+    array of corresponding outputs {\tt y}, and computes the least
+    squares error.
+  \end{task}
+\end{frame}
+
+\begin{frame}
+  \begin{center}
+    \Huge That's it.
+  \end{center}
+\end{frame}
+
+\end{document}
diff --git a/statistics/lecture_statistics04.tex b/statistics/lecture_statistics04.tex
new file mode 100644
index 0000000..de5b760
--- /dev/null
+++ b/statistics/lecture_statistics04.tex
@@ -0,0 +1,310 @@
+\documentclass{beamer}
+\usepackage{xcolor}
+\usepackage{listings}
+\usepackage{pgf}
+%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
+%\usepackage{multimedia}
+\usepackage[latin1]{inputenc}
+\usepackage{amsmath, amssymb}
+\usepackage{bm}
+\usepackage[T1]{fontenc}
+\usepackage{hyperref}
+\usepackage{ulem}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\mode<presentation>
+{
+  \usetheme{Singapore}
+  \setbeamercovered{opaque}
+  \usecolortheme{tuebingen}
+  \setbeamertemplate{navigation symbols}{}
+  \usefonttheme{default}
+  \useoutertheme{infolines}
+  % \useoutertheme{miniframes}
+}
+
+\AtBeginSubsection[]
+{
+  \begin{frame}
+    \begin{center}
+      \Huge \insertsectionhead
+    \end{center}
+    \tableofcontents[
+    currentsubsection,
+    hideothersubsections,
+    sectionstyle=show/hide,
+    subsectionstyle=show/shaded,
+    ]
+    % \frametitle{\insertsectionhead}
+  \end{frame}
+}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5
+
+\setbeamertemplate{blocks}[rounded][shadow=true]
+
+\title[]{Scientific Computing -- Statistics}
+\author[Statistics]{Fabian Sinz\\Dept. Neuroethology,
+  University T\"ubingen\\
+Bernstein Center T\"ubingen}
+
+\institute[Scientific Computing]{}
+ \date{10/23/2014}
+%\logo{\pgfuseimage{logo}}
+
+\subject{Lectures}
+
+%%%%%%%%%% configuration for code
+\lstset{
+  basicstyle=\ttfamily,
+  numbers=left,
+  showstringspaces=false,
+  language=Matlab,
+  commentstyle=\itshape\color{darkgray},
+  keywordstyle=\color{blue},
+  stringstyle=\color{green},
+  backgroundcolor=\color{blue!10},
+  breaklines=true,
+  breakautoindent=true,
+  columns=flexible,
+  frame=single,
+  captionpos=b,
+  xleftmargin=1em,
+  xrightmargin=1em,
+  aboveskip=10pt
+  }
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\newcommand{\mycite}[1]{
+\begin{flushright}
+\tiny \color{black!80} #1
+\end{flushright}
+}
+
+\input{../latex/environments.tex}
+\makeatother
+
+\begin{document}
+
+\begin{frame}
+  \titlepage
+\end{frame}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Day 4-5 -- curve fitting and maximum likelihood}
+\subsection{curve fitting and optimization}
+\begin{frame}
+  \frametitle{Overview}
+  \begin{itemize}
+  \item minimizing/maximizing a function numerically (optimization) is
+    ubiquitous in science (curve fitting, maximum likelihood, ...)
+  \item today we will look at the basic elements of optimization and
+    apply them to curve fitting
+  \item tomorrow, we will apply them to maximum likelihood
+  \end{itemize}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{plotting surfaces}
+\begin{lstlisting}
+range = linspace(-1,1,20);
+[X,Y] = meshgrid(range, range);
+
+surf(X, Y, (X.^2 + Y.^2));
+colormap('winter');
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}
+  \frametitle{linear least squares}
+  \begin{minipage}{1.0\linewidth}
+    \begin{minipage}{0.3\linewidth}
+      \includegraphics[width=\linewidth]{figs/leastsquares.png}
+      \source{http://en.wikipedia.org/wiki/Linear\_least\_squares\_\%28mathematics\%29}
+    \end{minipage}
+    \begin{minipage}{0.7\linewidth}
+      \begin{itemize}
+      \item The most common curve fitting problem is {\em linear least
+          squares}.
+      \item Its goal is to predict a set of output values $y_1, ...,
+        y_n$ from their corresponding input values $x_1,...,x_n$ with
+        a line $f_{a,b}(x) = a x+b$.
+      \item How is the line chosen?\pause
+      \item[] By minimization of the mean squared error
+        $$g(a,b) = \sum_{i=1}^n (y_i - f_{a,b}(x_i))^2$$
+      \end{itemize}
+    \end{minipage}
+  \end{minipage}
+\end{frame}
+
+\begin{frame}
+  \frametitle{error surface}
+  \begin{task}{plotting the error surface}
+    \begin{itemize}
+    \item Write a function {\tt lserr} that takes a 2-dimensional
+      parameter vector (slope $a$ and offset $b$), an array of inputs
+      {\tt x}, and an array of corresponding outputs {\tt y}, and
+      computes the least squares error
+      $$g(a,b) = \sum_{i=1}^n (y_i - f_{a,b}(x_i))^2$$
+      with $$f_{a,b}(x_i) = a x_i + b.$$
+    \item Generate an example dataset with {\tt x=linspace(-5,5,20)}
+      and {\tt y = .5*x + 1 + randn(1,length(x))}.
+    \item Write a script that plots the error surface as a function
+      of $a$ and $b$.
+    \end{itemize}
+  \end{task}
+\end{frame}
+
+\begin{frame}
+  \frametitle{optima and derivatives}
+  \begin{itemize}
+  \item How did you find maxima/minima of functions at school?\pause
+  \item[] Compute the derivative, set it to zero, and solve for $x$.\pause
+  \item Can anybody remember how a derivative is defined (hint:
+    differential quotient)?\pause
+  \item[]$$f'(x) = \lim_{h\rightarrow 0} \frac{f(x+h) - f(x)}{h}$$
+  \item Could you write down this expression for the partial
+    derivative $\frac{\partial g(a,b)}{\partial a}$ of $g(a,b)$ w.r.t. $a$?\pause
+  \item[]$$\frac{\partial g(a,b)}{\partial a} = \lim_{h\rightarrow 0}
+    \frac{g(a+h,b) - g(a,b)}{h}$$\pause
+  \item What about $\frac{\partial g(a,b)}{\partial b}$?
+  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{gradient and numerical derivative}
+  \begin{definition}{gradient}
+    The {\em gradient} $$\nabla g(a,b) = \left(\frac{\partial
+        g(a,b)}{\partial a}, \frac{\partial g(a,b)}{\partial b}
+    \right)$$ is the vector of partial derivatives of $g$ w.r.t. $a$
+    and $b$.
+  \end{definition}
+  We can numerically approximate it by using the definition of the
+  derivative
+  $$\frac{\partial g(a,b)}{\partial a} = \lim_{h\rightarrow 0}
+  \frac{g(a+h,b) - g(a,b)}{h} \approx \frac{g(a+h,b) - g(a,b)}{h},$$
+  for very small $h$ (e.g. {\tt h=1e-6}).
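+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{numerical derivative -- a quick check}
+  A minimal sketch to convince yourself that the approximation works
+  (not part of the exercise; we approximate the derivative of $\sin$
+  by a forward difference and compare it to the true derivative
+  $\cos$):
+\begin{lstlisting}
+h = 1e-6;
+x = linspace(0, 2*pi, 10);
+
+dnum = (sin(x + h) - sin(x))/h;  % forward difference
+dtrue = cos(x);                  % analytical derivative
+
+disp(max(abs(dnum - dtrue)));    % error is on the order of h
+\end{lstlisting}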
+\end{frame}
+
+\begin{frame}
+  \frametitle{error surface}
+  \begin{task}{plotting the gradient field}
+    \begin{itemize}
+    \item Write a function {\tt lserr\_gradient} that takes the same
+      arguments as {\tt lserr}, but numerically computes the gradient
+      $$\nabla g(a,b) = \left(\frac{\partial g(a,b)}{\partial a}, \frac{\partial g(a,b)}{\partial b} \right)$$
+    \item Add the gradient field as a vector field to your plot (use
+      {\tt quiver}).
+    \item Add a contour plot of the error surface as well (use {\tt contour}).
+    \item What can you observe about the directions of the gradients
+      with respect to the contour lines?
+    \end{itemize}
+  \end{task}
+\end{frame}
+
+\begin{frame}
+  \frametitle{gradient descent}
+  \begin{itemize}
+  \item The gradient $\nabla g(a,b)$ always points in the direction
+    of steepest ascent. \pause
+  \item How do we get the direction of steepest descent? \pause
+  \item[] We take minus the gradient $-\nabla g(a,b)$. \pause
+  \end{itemize}
+  {\bf gradient descent algorithm}
+  \begin{enumerate}
+  \item Start at some starting point $\mathbf p_0 = (a_0,b_0)$.
+  \item Repeat while the gradient is large enough:
+    \begin{itemize}
+    \item Compute the gradient at the current position $\mathbf p_t=(a_t,b_t)$.
+    \item Walk a small step in the negative gradient direction via $$\mathbf p_{t+1} =
+      \mathbf p_{t} - \varepsilon \nabla g(a_t,b_t)$$ where
+      $\varepsilon$ is a small number.
+    \end{itemize}
+  \end{enumerate}
+\end{frame}
+
+\begin{frame}
+  \frametitle{gradient descent}
+  \begin{task}{gradient descent}
+    \begin{itemize}
+    \item Implement a gradient descent for our linear regression
+      problem.
+    \item At each step in the algorithm, plot the error surface and
+      the current parameter point (hint: use {\tt plot3} to plot a
+      point in 3D).
+    \item At each step also plot the linear regression line along with
+      the data points in a separate plot.
+    \item It is a good idea to use {\tt pause(.1)} after each plot, so
+      MATLAB has time to update the plots and you have time to watch
+      the gradient descent at work.
+    \end{itemize}
+  \end{task}
+\end{frame}
+
+\begin{frame}[fragile]
+  \frametitle{optimization with matlab}
+\scriptsize
+A small adaptation of the objective function
+\begin{lstlisting}
+function [err, grad] = lserr(param, x, y)
+  err = mean( (param(1)*x + param(2) - y).^2 );
+
+  if nargout == 2
+     grad = lserr_gradient(param, x, y);
+  end
+\end{lstlisting}
+The actual optimization
+\begin{lstlisting}
+function param = estimate_regression(x, y, param0)
+  options = optimoptions(@fminunc, 'GradObj', 'on');
+  myfunc = @(p)(lserr(p, x, y));
+
+  param = fminunc(myfunc, param0, options);
+\end{lstlisting}
+\end{frame}
+
+\begin{frame}
+  \frametitle{nonlinear regression}
+  \scriptsize
+  \begin{task}{fit a charging curve}
+    The following problem arises when estimating the time constant of
+    a membrane from data.
+
+    \vspace{.5cm}
+
+    \begin{minipage}{1.0\linewidth}
+      \begin{minipage}{0.5\linewidth}
+        \begin{center}
+          \includegraphics[width=\linewidth]{figs/charging.png}
+        \end{center}
+      \end{minipage}
+      \begin{minipage}{0.5\linewidth}
+        \begin{itemize}
+        \item Download the data {\tt membraneVoltage.mat}. It contains
+          the points plotted on the right hand side.
+        \item Write a nonlinear least squares fit to fit the function
+          $$ f_{A,\tau}(t) = A\cdot \left(1 -
+            e^{-\frac{t}{\tau}}\right)$$
+          to the data.
+        \item This looks scary, but it is not: if you programmed
+          everything correctly beforehand, you only need to adapt the
+          function {\tt lserr} (see the sketch on the next slide) and
+          use the optimization from the slide before.
+        \item Plot the final result along with the data points.
+        \end{itemize}
+      \end{minipage}
+    \end{minipage}
+  \end{task}
+\end{frame}
+
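+\begin{frame}[fragile]
+  \frametitle{adapting the objective function}
+\scriptsize
+One possible adaptation for the charging curve (a sketch, not the
+official solution; the function name and the starting values are made
+up and should be adapted to the data):
+\begin{lstlisting}
+function err = charging_err(param, t, v)
+% mean squared error of f(t) = A*(1 - exp(-t/tau))
+% param(1) is A, param(2) is tau
+  f = param(1) * (1 - exp(-t./param(2)));
+  err = mean( (f - v).^2 );
+\end{lstlisting}
+which can be minimized without an explicit gradient, e.g. via
+\begin{lstlisting}
+myfunc = @(p)(charging_err(p, t, v));
+param = fminunc(myfunc, [1, 1]);  % [A0, tau0]: rough starting guess
+\end{lstlisting}
+\end{frame}
+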
+\begin{frame}
+  \begin{center}
+    \Huge That's it.
+  \end{center}
+\end{frame}
+
+\end{document}
diff --git a/statistics/matlab/ci_mean.m b/statistics/matlab/ci_mean.m
index 0439252..936609f 100644
--- a/statistics/matlab/ci_mean.m
+++ b/statistics/matlab/ci_mean.m
@@ -1,6 +1,6 @@
 load thymusglandweights.dat
 
-n = 80;
+n = 16;
 x = thymusglandweights(1:n);
 
 m = 5000;
diff --git a/statistics/matlab/estimate_regression.m b/statistics/matlab/estimate_regression.m
new file mode 100644
index 0000000..9fd9900
--- /dev/null
+++ b/statistics/matlab/estimate_regression.m
@@ -0,0 +1,5 @@
+function param = estimate_regression(x, y, param0)
+  options = optimoptions(@fminunc, 'GradObj', 'on');
+  myfunc = @(p)(lserr(p, x, y));
+
+  param = fminunc(myfunc, param0, options);
diff --git a/statistics/matlab/lserr.m b/statistics/matlab/lserr.m
new file mode 100644
index 0000000..f54221f
--- /dev/null
+++ b/statistics/matlab/lserr.m
@@ -0,0 +1,6 @@
+function [err, grad] = lserr(param, x, y)
+  err = mean( (param(1)*x + param(2) - y).^2 );
+
+  if nargout == 2
+     grad = lserr_gradient(param, x, y);
+  end
\ No newline at end of file
diff --git a/statistics/matlab/lserr_gradient.m b/statistics/matlab/lserr_gradient.m
new file mode 100644
index 0000000..51c779b
--- /dev/null
+++ b/statistics/matlab/lserr_gradient.m
@@ -0,0 +1,10 @@
+function grad = lserr_gradient(param, x, y)
+  h = 1e-6;
+
+  grad = 0*param;
+  for i = 1:length(param)
+      paramh = param;
+      paramh(i) = param(i) + h;
+      grad(i) = (lserr(paramh, x, y) - lserr(param, x, y))/h;
+  end
+
\ No newline at end of file
diff --git a/statistics/matlab/my_gradient_descent b/statistics/matlab/my_gradient_descent
new file mode 100644
index 0000000..9104020
--- /dev/null
+++ b/statistics/matlab/my_gradient_descent
@@ -0,0 +1,30 @@
+param0 = [1,1];
+step = 0.01;
+
+m = 50;
+arange = linspace(0,1,m);
+brange = linspace(.5,1.5, m);
+
+[A,B] = meshgrid(arange, brange);
+
+E = 0*A;
+
+x = linspace(-5,5,20);
+y = .5*x + 1 + randn(1, length(x));
+
+U = 0*A;
+V = 0*A;
+
+
+for i = 1:m
+    for j = 1:m
+        E(i,j) = lserr([A(i,j), B(i,j)], x, y);
+        grad = lserr_gradient([A(i,j), B(i,j)], x, y);
+        U(i,j) = grad(1);
+        V(i,j) = grad(2);
+    end
+end
+colormap('jet');
+
+surf(A, B, E, 'FaceAlpha', .5);
\ No newline at end of file
diff --git a/statistics/matlab/my_gradient_descent.m b/statistics/matlab/my_gradient_descent.m
new file mode 100644
index 0000000..264362b
--- /dev/null
+++ b/statistics/matlab/my_gradient_descent.m
@@ -0,0 +1,62 @@
+close all
+clear
+
+m = 50;
+arange = linspace(0,1,m);
+brange = linspace(.5,1.5, m);
+
+[A,B] = meshgrid(arange, brange);
+
+E = 0*A;
+
+x = linspace(-5,5,20);
+y = .5*x + 1 + randn(1, length(x));
+
+U = 0*A;
+V = 0*A;
+
+
+for i = 1:m
+    for j = 1:m
+        E(i,j) = lserr([A(i,j), B(i,j)], x, y);
+        grad = lserr_gradient([A(i,j), B(i,j)], x, y);
+        U(i,j) = grad(1);
+        V(i,j) = grad(2);
+    end
+end
+colormap('gray');
+
+subplot(1,2,1);
+hold on
+surf(A, B, E, 'FaceAlpha', .5);
+shading interp
+pause
+subplot(1,2,2);
+plot(x, y, 'ok');
+
+%%
+
+t = linspace(-5,5,100);
+param0 = [0,0];
+step = 0.01;
+param = param0;
+
+
+for i = 1:100
+    err = lserr(param, x, y);
+    derr = lserr_gradient(param, x, y);
+    subplot(1,2,1);
+    plot3(param(1), param(2), err, 'or');
+
+    subplot(1,2,2);
+    hold off
+    plot(x, y, 'ok');
+    hold on
+    plot(t, param(1)*t + param(2), '--k', 'LineWidth', 2);
+    pause(0.2);
+    param = param - step*derr;
+end
+
+hold off
\ No newline at end of file
diff --git a/statistics/matlab/mypower.m b/statistics/matlab/mypower.m
new file mode 100644
index 0000000..3a152b3
--- /dev/null
+++ b/statistics/matlab/mypower.m
@@ -0,0 +1,49 @@
+close all
+clear
+load thymusglandweights.dat
+
+literature_mean = 34.3;
+
+x = thymusglandweights;
+n = length(x);
+y = x - mean(x) + literature_mean;
+
+m = 2000;
+me_null = zeros(m,1);
+me_h1 = zeros(m,1);
+for i = 1:m
+    me_null(i) = mean(y(randi(n,n,1)));
+    me_h1(i) = mean(x(randi(n,n,1)));
+end
+
+bins = linspace(34,35,100);
+
+null = hist(me_null, bins);
+h1 = hist(me_h1, bins);
+bar(bins, null, 'FaceColor', [.3,.3,.3]);
+hold on
+bar(bins, h1, 'FaceColor', [.7,.7,.7]);
+mu = mean(x);
+plot([mu,mu], [0,200], '--r', 'LineWidth', 3);
+xlabel('thymus gland weights [g]');
+ylabel('frequency');
+title('bootstrapped null distribution');
+hold off
+
+% 5% significance boundaries
+low = quantile(me_null, 0.025);
+high = quantile(me_null, 0.975);
+disp(['the 5% boundaries are: ', num2str(low), ' ', num2str(high)]);
+
+hold on
+plot([low,low], [0,200], '--g', 'LineWidth', 3);
+plot([high,high], [0,200], '--g', 'LineWidth', 3);
+hold off
+
+% true positives: bootstrapped means outside the acceptance region
+idx = abs(me_h1 - literature_mean) > abs(literature_mean - low);
+pow = mean(idx);
+disp(['estimated power: ', num2str(pow)]);
+h1positive = hist(me_h1(idx), bins);
+hold on
+bar(bins, h1positive, 'FaceColor', 'g');
+hold off
+
diff --git a/statistics/matlab/winners_curse.m b/statistics/matlab/winners_curse.m
new file mode 100644
index 0000000..388e841
--- /dev/null
+++ b/statistics/matlab/winners_curse.m
@@ -0,0 +1,21 @@
+load thymusglandweights.dat
+
+n = 10;
+
+x = thymusglandweights;
+
+m = 5000;
+for i = 1:m
+    idx = randi(length(x), n, 1);
+    y = x(idx);
+    [h, p] = ttest(y, 34.3);
+
+    if h == 1
+        disp(['p-value: ', num2str(p)]);
+        disp(['mu: ', num2str(mean(y))]);
+        disp(['mu total: ', num2str(mean(x))]);
+        break
+    end
+end
+