commit 763b332eea (parent d9921cc9ae)

statistics/assignments/day3.tex (Normal file, 65 lines)
@@ -0,0 +1,65 @@
\documentclass[addpoints,10pt]{exam}
\usepackage{url}
\usepackage{color}
\usepackage{hyperref}

\pagestyle{headandfoot}
\runningheadrule
\firstpageheadrule

\firstpageheader{Scientific Computing}{afternoon assignment day 03}{10/22/2014}
%\runningheader{Homework 01}{Page \thepage\ of \numpages}{23. October 2014}
\firstpagefooter{}{}{}
\runningfooter{}{}{}
\pointsinmargin
\bracketedpoints

%\printanswers
\shadedsolutions


\begin{document}
%%%%%%%%%%%%%%%%%%%%% Submission instructions %%%%%%%%%%%%%%%%%%%%%%%%%
\sffamily
%%%%%%%%%%%%%% Questions %%%%%%%%%%%%%%%%%%%%%%%%%

\begin{questions}
\question When the p-value is small, we reject the null hypothesis.
  For example, if you want to test whether two means are not equal,
  the null hypothesis is ``the means are equal''. If, e.g., $p\le
  0.05$, we take this as sufficient evidence that the null
  hypothesis is not true. Therefore, we conclude that the means are
  not equal (which is what you want to show).

  In this exercise we will look at what kind of p-values to expect
  if the null hypothesis is true. In our example, this would be the
  case if the true means of the two datasets were actually equal.
  \begin{parts}
    \part Think about how you expect the p-values to behave in that
    situation.
    \part Simulate the situation in which the means are equal by
    repeating the following at least $1000$ times:
    \begin{enumerate}
      \item Generate two arrays {\tt x} and {\tt y} with $10$ normally
        (Gaussian) distributed random numbers using {\tt randn}. By
        construction, the true means behind the random numbers are zero.
      \item Perform a two-sample t-test ({\tt ttest2}) on {\tt x} and
        {\tt y}. Store the p-value.
    \end{enumerate}
    \part Plot a histogram of the $1000$ p-values. What do you think
    is the distribution of the p-values (i.e., if you repeated this
    experiment many more times, what would the histogram look like)?
    \part Given what you find, think about whether the following
    strategy is statistically valid: You collect $10$ data points from
    each group and perform a test. If the test is not significant, you
    collect $10$ more and repeat the test. If the test tells you that
    there is a significant difference, you stop. Otherwise you repeat
    the procedure until the test is significant.
  \end{parts}
\end{questions}
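
% A possible sketch for parts (b) and (c), kept here as a comment; the
% variable names are illustrative, not prescribed by the exercise:
%
%   pvals = zeros(1000, 1);
%   for i = 1:1000
%       x = randn(10, 1);               % true mean is zero by construction
%       y = randn(10, 1);
%       [h, pvals(i)] = ttest2(x, y);   % store the p-value
%   end
%   hist(pvals, 20);   % under the null, p-values are uniform on [0, 1]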

\end{document}

statistics/data/membraneVoltage.mat (Normal file, binary, not shown)

statistics/environments.tex (Symbolic link, 1 line)
@@ -0,0 +1 @@
../latex/environments.tex

statistics/figs/charging.png (Normal file, binary, 5.1 KiB, not shown)
statistics/figs/experimentalDesign00.pdf (Normal file, binary, not shown)
statistics/figs/experimentalDesign01.pdf (Normal file, binary, not shown)
statistics/figs/leastsquares.png (Normal file, binary, 12 KiB, not shown)
statistics/figs/testframework00.pdf (Normal file, binary, not shown)
statistics/figs/testframework01.pdf (Normal file, binary, not shown)
@@ -47,7 +47,7 @@
   Bernstein Center T\"ubingen}

 \institute[Scientific Computing]{}
-\date{10/20/2014}
+\date{10/21/2014}
 %\logo{\pgfuseimage{logo}}

 \subject{Lectures}

statistics/lecture_statistics03.tex (Normal file, 329 lines)
@@ -0,0 +1,329 @@
\documentclass{beamer}
\usepackage{xcolor}
\usepackage{listings}
\usepackage{pgf}
%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
%\usepackage{multimedia}
\usepackage[latin1]{inputenc}
\usepackage{amsmath}
\usepackage{bm}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{ulem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mode<presentation>
{
  \usetheme{Singapore}
  \setbeamercovered{opaque}
  \usecolortheme{tuebingen}
  \setbeamertemplate{navigation symbols}{}
  \usefonttheme{default}
  \useoutertheme{infolines}
% \useoutertheme{miniframes}
}

\AtBeginSubsection[]
{
  \begin{frame}<beamer>
    \begin{center}
      \Huge \insertsectionhead
    \end{center}
    \tableofcontents[
      currentsubsection,
      hideothersubsections,
      sectionstyle=show/hide,
      subsectionstyle=show/shaded,
    ]
%   \frametitle{\insertsectionhead}
  \end{frame}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\setbeamertemplate{blocks}[rounded][shadow=true]

\title[]{Scientific Computing -- Statistics}
\author[Statistics]{Fabian Sinz\\Dept. Neuroethology,
  University T\"ubingen\\
  Bernstein Center T\"ubingen}

\institute[Scientific Computing]{}
\date{10/22/2014}
%\logo{\pgfuseimage{logo}}

\subject{Lectures}

%%%%%%%%%% configuration for code
\lstset{
  basicstyle=\ttfamily,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  captionpos=b,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\mycite}[1]{
  \begin{flushright}
    \tiny \color{black!80} #1
  \end{flushright}
}

\input{../latex/environments.tex}
\makeatother

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 3 -- study design: choosing n}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{choosing n for confidence intervals}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
  \frametitle{general theme}
  \begin{enumerate}
  \item make an educated guess about the true parameters
  \item state how accurate/powerful you want to be
  \item select $n$ based on that
  \end{enumerate}
\end{frame}

\begin{frame}
  \frametitle{estimating a single mean}
  \framesubtitle{standard error and $\alpha$}
  \begin{itemize}
  \item Assume you have an estimate $s$ of the standard deviation from
    the literature.
  \item The $95$\% confidence interval consists of all $\mu_0$ with
    $$\underbrace{|\tilde\mu - \mu_0|}_{=:\delta} \le t_{97.5\%,
    \nu}\frac{s}{\sqrt{n}}$$\pause
  \item How should we choose $n$ to get a confidence interval of a
    particular size $\pm \delta$?\pause
  \item[] We should set $n$ to be
    $$n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 $$
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{exercise}
  \begin{task}{choosing $n$}
    Example from the last lecture: the literature value of thymus
    gland weights is $34.3$g. The estimate of the standard deviation
    from the literature is $s=10$g.

    The equation for $n$ is
    $$n \ge \left(\frac{t_{97.5\%, \nu}\cdot s}{\delta}\right)^2 $$
    \begin{itemize}
    \item Assume we want to sacrifice as few animals as possible and
      we are fine with a confidence interval of size $\pm\delta=5$.
      How should we choose $n$?
    \item What $n$ should we choose if we want $\pm\delta=2$?
    \end{itemize}
    Extend your bootstrapping script from yesterday to check that the
    equation is correct.
  \end{task}
\end{frame}
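
\begin{frame}[fragile]
  \frametitle{possible solution (sketch)}
  % A sketch rather than the official solution: since $\nu = n-1$ depends
  % on $n$, we simply search for the smallest $n$ that satisfies the
  % inequality. Assumes the Statistics Toolbox function tinv is available.
\begin{lstlisting}
s = 10;       % standard deviation from the literature
delta = 5;    % desired half-width of the confidence interval

n = 2;        % smallest n with a defined t-quantile
while n < (tinv(0.975, n - 1)*s/delta)^2
    n = n + 1;
end
disp(['choose n = ', num2str(n)]);
\end{lstlisting}
\end{frame}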

\begin{frame}[fragile]
  \frametitle{How to interrupt for/while loops}
  \begin{itemize}
  \item Sometimes you want to stop a for/while loop early.
  \item The command for that is {\tt break}.
  \end{itemize}
  {\bf Example}
\begin{lstlisting}
% silly way to find a random number larger than .8
for i = 1:2000
    u = rand();
    if u >= .8
        disp('Found it!');
        break
    end
end
\end{lstlisting}
\end{frame}

\begin{frame}
  \frametitle{winner's curse}
  \begin{task}{Why it is important to estimate $n$ beforehand}
    Use the thymus gland dataset to repeat the following procedure:
    \begin{enumerate}
    \item Randomly select $n=10$ numbers from the whole dataset.
    \item Perform a one-sample t-test ({\tt ttest}) to test against a
      mean of $34.3$g.
    \item If the p-value is smaller than $0.05$, stop the loop and
      print the mean of the $10$ data points. Also print the mean of
      the entire thymus gland dataset.
    \item Why is it better to use a {\tt for} instead of a {\tt while} loop?
    \item What can you observe? Why does that tell you that choosing
      $n$ is important?
    \end{enumerate}
  \end{task}
\end{frame}

\begin{frame}[fragile]
  \frametitle{solution}
  \scriptsize
\begin{lstlisting}
load thymusglandweights.dat

n = 10;

x = thymusglandweights;

for i = 1:5000
    idx = randi(length(x), n, 1);
    y = x(idx);
    [h, p] = ttest(y, 34.3);

    if h == 1
        disp(['p-value: ', num2str(p)]);
        disp(['mu: ', num2str(mean(y))]);
        disp(['mu total: ', num2str(mean(x))]);
        break
    end
end
\end{lstlisting}
\end{frame}

\subsection{power}

\begin{frame}
  \frametitle{test nomenclature}
  \begin{center}
    \only<1>{\includegraphics[width=\linewidth]{figs/testframework00.pdf}}
    \only<2>{\includegraphics[width=\linewidth]{figs/testframework01.pdf}}
  \end{center}
  \small
  \begin{columns}
    \begin{column}[t]{.5\linewidth}
      {\bf You want:}
      \begin{itemize}
      \item large power
      \item small type I \& II error probabilities ($\alpha$ and $\beta$)
      \end{itemize}
    \end{column}
    \begin{column}[t]{.5\linewidth}
    \end{column}
  \end{columns}
\end{frame}

\begin{frame}
  \frametitle{power}
  \begin{task}{estimating power with bootstrapping}
    \begin{itemize}
    \item Take the script from yesterday in which we simulated the
      null distribution of the means.
    \item Extend it such that it plots the bootstrapped distribution
      of the means as well (use the same bins for both histograms by
      using {\tt hist} for computing the histograms and {\tt bar} for
      plotting).
    \item Use logical indexing to find all means that correspond to
      true positives (using the 95\% decision boundaries computed
      yesterday). Estimate the power by computing the fraction of
      true positive bootstrapped means.
    \item What is the probability that you get a false negative?
    \item If you have time, plot the histogram of the true positives
      in a different color.
    \end{itemize}
  \end{task}
\end{frame}

\begin{frame}
  \frametitle{summary}
  \begin{itemize}
  \item Proper study design is important to avoid statistical problems
    like the winner's curse.
  \item You should choose a test with high power.
  \item There are also equations to select $n$ for type I error {\em
    and} power (see the book by Zar).
  \end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 4-5 -- curve fitting and maximum likelihood}
\begin{frame}
  \frametitle{Overview}
  \begin{itemize}
  \item minimizing/maximizing a function numerically (optimization) is
    ubiquitous in science (curve fitting, maximum likelihood, ...)
  \item today we will look at the basic elements of optimization and
    apply them to curve fitting
  \item tomorrow, we will apply them to maximum likelihood
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{plotting surfaces}
\begin{lstlisting}
range = linspace(-1,1,20);
[X,Y] = meshgrid(range, range);

surf(X, Y, (X.^2 + Y.^2));   % paraboloid z = x^2 + y^2
colormap('winter');
\end{lstlisting}
\end{frame}

\begin{frame}
  \frametitle{linear least squares}
  \begin{minipage}{1.0\linewidth}
    \begin{minipage}{0.3\linewidth}
      \includegraphics[width=\linewidth]{figs/leastsquares.png}
      \source{http://en.wikipedia.org/wiki/Linear\_least\_squares\_\%28mathematics\%29}
    \end{minipage}
    \begin{minipage}{0.7\linewidth}
      \begin{itemize}
      \item The most common curve fitting problem is {\em linear least
        squares}.
      \item Its goal is to predict a set of output values $y_1, ..., y_n$
        from their corresponding input values $x_1,...,x_n$ with
        a line $f_{a,b}(x) = a x+b$.
      \item How is the line chosen?\pause
      \item[] By minimization of the mean squared error
        $$g(a,b) = \frac{1}{n}\sum_{i=1}^n (y_i - f_{a,b}(x_i))^2$$
      \end{itemize}
    \end{minipage}
  \end{minipage}
\end{frame}

\begin{frame}
  \frametitle{error surface}
  \begin{task}{plotting the error surface}
    Write a function {\tt lserr} that takes a 2-dimensional parameter
    vector (slope and offset), an array of inputs {\tt x}, and an
    array of corresponding outputs {\tt y}, and computes the mean
    squared error of the corresponding line.
  \end{task}
\end{frame}

\begin{frame}
  \begin{center}
    \Huge That's it.
  \end{center}
\end{frame}

\end{document}

statistics/lecture_statistics04.tex (Normal file, 310 lines)
@@ -0,0 +1,310 @@
\documentclass{beamer}
\usepackage{xcolor}
\usepackage{listings}
\usepackage{pgf}
%\usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade}
%\usepackage{multimedia}
\usepackage[latin1]{inputenc}
\usepackage{amsmath, amssymb}
\usepackage{bm}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\usepackage{ulem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\mode<presentation>
{
  \usetheme{Singapore}
  \setbeamercovered{opaque}
  \usecolortheme{tuebingen}
  \setbeamertemplate{navigation symbols}{}
  \usefonttheme{default}
  \useoutertheme{infolines}
% \useoutertheme{miniframes}
}

\AtBeginSubsection[]
{
  \begin{frame}<beamer>
    \begin{center}
      \Huge \insertsectionhead
    \end{center}
    \tableofcontents[
      currentsubsection,
      hideothersubsections,
      sectionstyle=show/hide,
      subsectionstyle=show/shaded,
    ]
%   \frametitle{\insertsectionhead}
  \end{frame}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\setbeamertemplate{blocks}[rounded][shadow=true]

\title[]{Scientific Computing -- Statistics}
\author[Statistics]{Fabian Sinz\\Dept. Neuroethology,
  University T\"ubingen\\
  Bernstein Center T\"ubingen}

\institute[Scientific Computing]{}
\date{10/23/2014}
%\logo{\pgfuseimage{logo}}

\subject{Lectures}

%%%%%%%%%% configuration for code
\lstset{
  basicstyle=\ttfamily,
  numbers=left,
  showstringspaces=false,
  language=Matlab,
  commentstyle=\itshape\color{darkgray},
  keywordstyle=\color{blue},
  stringstyle=\color{green},
  backgroundcolor=\color{blue!10},
  breaklines=true,
  breakautoindent=true,
  columns=flexible,
  frame=single,
  captionpos=b,
  xleftmargin=1em,
  xrightmargin=1em,
  aboveskip=10pt
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\mycite}[1]{
  \begin{flushright}
    \tiny \color{black!80} #1
  \end{flushright}
}

\input{../latex/environments.tex}
\makeatother

\begin{document}

\begin{frame}
  \titlepage
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Day 4-5 -- curve fitting and maximum likelihood}
\subsection{curve fitting and optimization}
\begin{frame}
  \frametitle{Overview}
  \begin{itemize}
  \item minimizing/maximizing a function numerically (optimization) is
    ubiquitous in science (curve fitting, maximum likelihood, ...)
  \item today we will look at the basic elements of optimization and
    apply them to curve fitting
  \item tomorrow, we will apply them to maximum likelihood
  \end{itemize}
\end{frame}

\begin{frame}[fragile]
  \frametitle{plotting surfaces}
\begin{lstlisting}
range = linspace(-1,1,20);
[X,Y] = meshgrid(range, range);

surf(X, Y, (X.^2 + Y.^2));   % paraboloid z = x^2 + y^2
colormap('winter');
\end{lstlisting}
\end{frame}

\begin{frame}
  \frametitle{linear least squares}
  \begin{minipage}{1.0\linewidth}
    \begin{minipage}{0.3\linewidth}
      \includegraphics[width=\linewidth]{figs/leastsquares.png}
      \source{http://en.wikipedia.org/wiki/Linear\_least\_squares\_\%28mathematics\%29}
    \end{minipage}
    \begin{minipage}{0.7\linewidth}
      \begin{itemize}
      \item The most common curve fitting problem is {\em linear least
        squares}.
      \item Its goal is to predict a set of output values $y_1, ..., y_n$
        from their corresponding input values $x_1,...,x_n$ with
        a line $f_{a,b}(x) = a x+b$.
      \item How is the line chosen?\pause
      \item[] By minimization of the mean squared error
        $$g(a,b) = \frac{1}{n}\sum_{i=1}^n (y_i - f_{a,b}(x_i))^2$$
      \end{itemize}
    \end{minipage}
  \end{minipage}
\end{frame}

\begin{frame}
  \frametitle{error surface}
  \begin{task}{plotting the error surface}
    \begin{itemize}
    \item Write a function {\tt lserr} that takes a 2-dimensional
      parameter vector (slope $a$ and offset $b$), an array of inputs
      {\tt x}, and an array of corresponding outputs {\tt y}, and
      computes the mean squared error
      $$g(a,b) = \frac{1}{n}\sum_{i=1}^n (y_i - f_{a,b}(x_i))^2$$
      with $$f_{a,b}(x_i) = a x_i + b.$$
    \item Generate an example dataset with {\tt x=linspace(-5,5,20)}
      and {\tt y = .5*x + 1 + randn(1,length(x))}.
    \item Write a script that plots the error surface as a function
      of $a$ and $b$.
    \end{itemize}
  \end{task}
\end{frame}

\begin{frame}
  \frametitle{optima and derivatives}
  \begin{itemize}
  \item How did you find maxima/minima of functions at school?\pause
  \item[] Compute the derivative, set it to zero, and solve for $x$.\pause
  \item Can anybody remember how a derivative is defined (hint:
    differential quotient)?\pause
  \item[]$$f'(x) = \lim_{h\rightarrow 0} \frac{f(x+h) - f(x)}{h}$$
  \item Could you write down this expression for the partial
    derivative $\frac{\partial g(a,b)}{\partial a}$ of $g(a,b)$ w.r.t. $a$?\pause
  \item[]$$\frac{\partial g(a,b)}{\partial a} = \lim_{h\rightarrow 0}
    \frac{g(a+h,b) - g(a,b)}{h}$$\pause
  \item What about $\frac{\partial g(a,b)}{\partial b}$?
  \end{itemize}
\end{frame}

\begin{frame}
  \frametitle{gradient and numerical derivative}
  \begin{definition}{gradient}
    The {\em gradient} $$\nabla g(a,b) = \left(\frac{\partial
      g(a,b)}{\partial a}, \frac{\partial g(a,b)}{\partial b}
    \right)$$ is the vector of partial derivatives of $g$ w.r.t. $a$
    and $b$.
  \end{definition}
  We can numerically approximate it by using the definition of the
  derivative
  $$\frac{\partial g(a,b)}{\partial a} = \lim_{h\rightarrow 0}
  \frac{g(a+h,b) - g(a,b)}{h} \approx \frac{g(a+h,b) - g(a,b)}{h},$$
  for very small $h$ (e.g. {\tt h=1e-6}).
\end{frame}
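
\begin{frame}[fragile]
  \frametitle{numerical derivative: a quick check}
  % An illustrative sketch (the example values are made up): the forward
  % difference should be close to the analytical derivative.
\begin{lstlisting}
f = @(x) x.^2;    % example function with f'(x) = 2x
h = 1e-6;

x = 1.5;
dapprox = (f(x + h) - f(x))/h;   % forward difference
disp([dapprox, 2*x]);            % approx. 3 vs. exactly 3
\end{lstlisting}
\end{frame}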

\begin{frame}
  \frametitle{error surface}
  \begin{task}{plotting the gradient field}
    \begin{itemize}
    \item Write a function {\tt lserr\_gradient} that takes the same
      arguments as {\tt lserr}, but numerically computes the gradient
      $$\nabla g(a,b) = \left(\frac{\partial g(a,b)}{\partial a}, \frac{\partial g(a,b)}{\partial b} \right)$$
    \item Add the gradient field as a vector field to your plot (use
      {\tt quiver}).
    \item Add a contour plot of the error surface as well (use {\tt contour}).
    \item What can you observe about the directions of the gradient
      with respect to the contour lines?
    \end{itemize}
  \end{task}
\end{frame}

\begin{frame}
  \frametitle{gradient descent}
  \begin{itemize}
  \item The gradient $\nabla g(a,b)$ always points in the direction
    of steepest ascent. \pause
  \item How do we get the direction of steepest descent? \pause
  \item[] We take the negative gradient $-\nabla g(a,b)$. \pause
  \end{itemize}
  {\bf gradient descent algorithm}
  \begin{enumerate}
  \item Start at some starting point $\mathbf p_0 = (a_0,b_0)$.
  \item Repeat while the gradient is large enough:
    \begin{itemize}
    \item Compute the gradient at the current position $\mathbf p_t=(a_t,b_t)$.
    \item Walk a small step in the direction of the negative gradient via
      $$\mathbf p_{t+1} = \mathbf p_{t} - \varepsilon \nabla g(a_t,b_t)$$ where
      $\varepsilon$ is a small number.
    \end{itemize}
  \end{enumerate}
\end{frame}
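
\begin{frame}[fragile]
  \frametitle{gradient descent: minimal skeleton}
  % A bare-bones sketch of the algorithm above; it assumes x, y and
  % lserr_gradient from the previous exercises, and the stopping
  % threshold 1e-4 is an arbitrary illustrative choice.
\begin{lstlisting}
param = [0, 0];   % starting point p_0
step = 0.01;      % epsilon

grad = lserr_gradient(param, x, y);
while norm(grad) > 1e-4   % "while gradient is large enough"
    param = param - step*grad;   % walk downhill
    grad = lserr_gradient(param, x, y);
end
\end{lstlisting}
\end{frame}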

\begin{frame}
  \frametitle{gradient descent}
  \begin{task}{gradient descent}
    \begin{itemize}
    \item Implement a gradient descent for our linear regression
      problem.
    \item At each step in the algorithm, plot the error surface and
      the current parameter point (hint: use {\tt plot3} to plot a
      point in 3D).
    \item At each step also plot the linear regression line along with
      the data points in a separate plot.
    \item It is a good idea to use {\tt pause(.1)} after each plot, so
      that MATLAB has time to update the plots and you have time to
      watch the gradient descent at work.
    \end{itemize}
  \end{task}
\end{frame}

\begin{frame}[fragile]
  \frametitle{optimization with matlab}
  \scriptsize
  A small adaptation of the objective function:
\begin{lstlisting}
function [err, grad] = lserr(param, x, y)
err = mean( (param(1)*x + param(2) - y).^2 );

if nargout == 2
    grad = lserr_gradient(param, x, y);
end
\end{lstlisting}
  The actual optimization:
\begin{lstlisting}
function param = estimate_regression(x, y, param0)
options = optimoptions(@fminunc, 'GradObj', 'on');
myfunc = @(p)(lserr(p, x, y));
param = fminunc(myfunc, param0, options);
\end{lstlisting}
\end{frame}

\begin{frame}
  \frametitle{nonlinear regression}
  \scriptsize
  \begin{task}{fit a charging curve}
    The following problem arises when estimating the time constant of
    a membrane from data.

    \vspace{.5cm}

    \begin{minipage}{1.0\linewidth}
      \begin{minipage}{0.5\linewidth}
        \begin{center}
          \includegraphics[width=\linewidth]{figs/charging.png}
        \end{center}
      \end{minipage}
      \begin{minipage}{0.5\linewidth}
        \begin{itemize}
        \item Download the data {\tt membraneVoltage.mat}. It contains
          the points plotted on the right hand side.
        \item Write a nonlinear least squares fit that fits the function
          $$ f_{A,\tau}(t) = A\cdot \left(1 -
          e^{-\frac{t}{\tau}}\right)$$
          to the data.
        \item This looks scary, but it is not: if you programmed
          everything correctly beforehand, you only need to adapt the
          function {\tt lserr} and use the optimization from the slide
          before.
        \item Plot the final result along with the data points.
        \end{itemize}
      \end{minipage}
    \end{minipage}
  \end{task}
\end{frame}
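
\begin{frame}[fragile]
  \frametitle{possible adaptation (sketch)}
  % One way the adaptation of lserr could look; a sketch, not the
  % official solution. It assumes the time axis t and the measured
  % voltages y have been loaded from membraneVoltage.mat (the variable
  % names inside the file may differ).
\begin{lstlisting}
function [err, grad] = lserr(param, t, y)
% param(1) = A, param(2) = tau
err = mean( (param(1)*(1 - exp(-t/param(2))) - y).^2 );

if nargout == 2
    grad = lserr_gradient(param, t, y);
end
\end{lstlisting}
\end{frame}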

\begin{frame}
  \begin{center}
    \Huge That's it.
  \end{center}
\end{frame}

\end{document}
@@ -1,6 +1,6 @@
 load thymusglandweights.dat

-n = 80;
+n = 16;
 x = thymusglandweights(1:n);

 m = 5000;

statistics/matlab/estimate_regression.m (Normal file, 5 lines)
@@ -0,0 +1,5 @@
function param = estimate_regression(x, y, param0)
options = optimoptions(@fminunc, 'GradObj', 'on');  % lserr also returns the gradient
myfunc = @(p)(lserr(p, x, y));

param = fminunc(myfunc, param0, options);

statistics/matlab/lserr.m (Normal file, 6 lines)
@@ -0,0 +1,6 @@
function [err, grad] = lserr(param, x, y)
err = mean( (param(1)*x + param(2) - y).^2 );  % mean squared error of the line param(1)*x + param(2)

if nargout == 2
    grad = lserr_gradient(param, x, y);  % gradient only when requested
end

statistics/matlab/lserr_gradient.m (Normal file, 10 lines)
@@ -0,0 +1,10 @@
function grad = lserr_gradient(param, x, y)
h = 1e-6;  % step size for the finite differences

grad = 0*param;  % preallocate with the same shape as param
for i = 1:length(param)
    paramh = param;
    paramh(i) = param(i) + h;  % perturb one parameter at a time
    grad(i) = (lserr(paramh, x, y) - lserr(param, x, y))/h;  % forward difference
end

statistics/matlab/my_gradient_descent (Normal file, 30 lines)
@@ -0,0 +1,30 @@
param0 = [1, 1];  % starting point (not used in this script)
step = 0.01;      % epsilon (not used in this script)

m = 50;
arange = linspace(0, 1, m);
brange = linspace(.5, 1.5, m);

[A, B] = meshgrid(arange, brange);

E = 0*A;  % error surface

x = linspace(-5, 5, 20);
y = .5*x + 1 + randn(1, length(x));  % noisy line with a=.5, b=1

U = 0*A;  % gradient components
V = 0*A;

for i = 1:m
    for j = 1:m
        E(i,j) = lserr([A(i,j), B(i,j)], x, y);
        grad = lserr_gradient([A(i,j), B(i,j)], x, y);
        U(i,j) = grad(1);
        V(i,j) = grad(2);
    end
end
colormap('jet');

surf(A, B, E, 'FaceAlpha', .5);

statistics/matlab/my_gradient_descent.m (Normal file, 62 lines)
@@ -0,0 +1,62 @@
close all
clear

% sample the error surface on a grid
m = 50;
arange = linspace(0, 1, m);
brange = linspace(.5, 1.5, m);

[A, B] = meshgrid(arange, brange);

E = 0*A;

x = linspace(-5, 5, 20);
y = .5*x + 1 + randn(1, length(x));

U = 0*A;
V = 0*A;

for i = 1:m
    for j = 1:m
        E(i,j) = lserr([A(i,j), B(i,j)], x, y);
        grad = lserr_gradient([A(i,j), B(i,j)], x, y);
        U(i,j) = grad(1);
        V(i,j) = grad(2);
    end
end
colormap('gray');

subplot(1,2,1);
hold on
surf(A, B, E, 'FaceAlpha', .5);
shading interp
pause
subplot(1,2,2);
plot(x, y, 'ok');

%% gradient descent

t = linspace(-5, 5, 100);
param0 = [0, 0];
step = 0.01;
param = param0;

for i = 1:100
    err = lserr(param, x, y);
    derr = lserr_gradient(param, x, y);
    subplot(1,2,1);
    plot3(param(1), param(2), err, 'or');  % current point on the error surface

    subplot(1,2,2);
    hold off
    plot(x, y, 'ok');
    hold on
    plot(t, param(1)*t + param(2), '--k', 'LineWidth', 2);  % current regression line
    pause(0.2);
    param = param - step*derr;  % gradient descent step
end

hold off

statistics/matlab/mypower.m (Normal file, 49 lines)
@@ -0,0 +1,49 @@
close all
clear all
load thymusglandweights.dat

literature_mean = 34.3;

x = thymusglandweights;
n = length(x);
y = x - mean(x) + literature_mean;  % shift data so the null hypothesis is true

m = 2000;
me_null = zeros(m, 1);
me_h1 = zeros(m, 1);
for i = 1:m
    me_null(i) = mean(y(randi(n, n, 1)));  % bootstrapped means under the null
    me_h1(i) = mean(x(randi(n, n, 1)));    % bootstrapped means of the data
end

bins = linspace(34, 35, 100);

null = hist(me_null, bins);
h1 = hist(me_h1, bins);
bar(bins, null, 'FaceColor', [.3, .3, .3]);
hold on
bar(bins, h1, 'FaceColor', [.7, .7, .7]);
mu = mean(x);
plot([mu, mu], [0, 200], '--r', 'LineWidth', 3);
xlabel('thymus gland weights [g]');
ylabel('frequency');
title('bootstrapped null distribution');
hold off

% 5% significance boundaries
low = quantile(me_null, 0.025);
high = quantile(me_null, 0.975);
disp(['the 5% boundaries are: ', num2str(low), ' ', num2str(high)]);

hold on
plot([low, low], [0, 200], '--g', 'LineWidth', 3);
plot([high, high], [0, 200], '--g', 'LineWidth', 3);
hold off

idx = abs(me_h1 - literature_mean) > abs(literature_mean - low);  % true positives
pow = mean(idx);  % estimated power
h1positive = hist(me_h1(idx), bins);
hold on
bar(bins, h1positive, 'FaceColor', 'g');
hold off

statistics/matlab/plot_error_surface.m (Normal file, 38 lines)
@@ -0,0 +1,38 @@
close all;
clear;

m = 50;
arange = linspace(0, 1, m);
brange = linspace(.5, 1.5, m);

[A, B] = meshgrid(arange, brange);

E = 0*A;

x = linspace(-5, 5, 20);
y = .5*x + 1 + randn(1, length(x));

U = 0*A;
V = 0*A;

for i = 1:m
    for j = 1:m
        E(i,j) = lserr([A(i,j), B(i,j)], x, y);
        grad = lserr_gradient([A(i,j), B(i,j)], x, y);
        U(i,j) = grad(1);
        V(i,j) = grad(2);
    end
end
colormap('jet');

surf(A, B, E, 'FaceAlpha', .5);
%shading interp;
hold on
contour(A, B, E, 50, 'LineColor', 'k')
quiver(A, B, U, V);
xlabel('a');
ylabel('b');
zlabel('mean square error')
axis([0, 1, .5, 1.5])
@@ -11,7 +11,7 @@ y = x - mean(x) + literature_mean;
 m = 2000;
 me = zeros(m,1);
 for i = 1:m
-    me(i) = median(y(randi(n,n,1)));
+    me(i) = mean(y(randi(n,n,1)));
 end

 hist(me, 50);
statistics/matlab/winners_curse.m
Normal file
21
statistics/matlab/winners_curse.m
Normal file
@ -0,0 +1,21 @@
|
||||
load thymusglandweights.dat
|
||||
|
||||
n = 10;
|
||||
|
||||
x = thymusglandweights;
|
||||
|
||||
m = 5000;
|
||||
for i = 1:m
|
||||
idx = randi(length(x), n,1);
|
||||
y = x(idx);
|
||||
[h,p] = ttest(y, 34.3);
|
||||
|
||||
if h == 1
|
||||
disp(['p-value: ', num2str(p)]);
|
||||
disp(['mu: ', num2str(mean(y))]);
|
||||
disp(['mu total: ', num2str(mean(x))]);
|
||||
break
|
||||
end
|
||||
end
|
||||
|
||||
|