improved statistics chapter

Jan Benda 2017-11-25 14:15:04 +01:00
parent f362788620
commit 12a417d6bc
6 changed files with 252 additions and 121 deletions

View File

@ -216,21 +216,30 @@
%%%%% code/matlab commands: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{textcomp}
% typeset code inline:
\newcommand{\varcode}[1]{\setlength{\fboxsep}{0.5ex}\colorbox{codeback}{\texttt{#1\protect\rule[-0.1ex]{0pt}{1.6ex}}}}
% typeset code and add it to the python index:
\newcommand{\pcode}[2][]{\varcode{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[pcode]{#2}}{\protect\sindex[pcode]{#1}}}
% typeset code and add it to the matlab index:
\newcommand{\mcode}[2][]{\varcode{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[mcode]{#2}}{\protect\sindex[mcode]{#1}}}
% XXX typeset code and put it into the matlab index:
% THIS SHOULD actually take both the matlab and the python code!
\newcommand{\code}[2][]{\varcode{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[mcode]{#2}}{\protect\sindex[mcode]{#1}}}
% the name of the python language:
\newcommand{\python}{Python}
% the name of the matlab language:
\newcommand{\matlab}{\texorpdfstring{MATLAB$^{\copyright}$}{MATLAB}}
% typeset '(python-function #1)' and add the function to the python index:
\newcommand{\pythonfun}[1]{(\tr{\python-function}{\python-Funktion} \varcode{#1})\protect\sindex[pcode]{#1}}
% typeset '(matlab-function #1)' and add the function to the matlab index:
\newcommand{\matlabfun}[1]{(\tr{\matlab-function}{\matlab-Funktion} \varcode{#1})\protect\sindex[mcode]{#1}}
%%%%% shortquote and widequote commands: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

View File

@ -1,54 +1,111 @@
import numpy as np
import matplotlib.pyplot as plt

#rng = np.random.RandomState(981)
#data = rng.randn(40, 1) + 4.0
rng = np.random.RandomState(1981)
data = rng.gamma(1.0, 1.5, 40) + 1.0
data = data[data<7.5]

xpos = 0.08
ypos = 0.15
width = 0.65
height = 0.8

plt.xkcd()
fig = plt.figure( figsize=(6,3.4) )

ax = fig.add_axes([xpos, ypos, width, height])
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xticklabels([])
ax.set_xlim(0.0, 4.8)
ax.set_ylabel('x')
ax.set_ylim( 0.0, 8.0)
barwidth = 0.8

scatterpos = 1.0
barpos = 2.5
boxpos = 4.0

ax.set_xticks([scatterpos, barpos, boxpos])
ax.set_xticklabels(['(1) data', '(2) bar\n plot', '(3) box-\nwhisker'])

ax.scatter(scatterpos-0.5*barwidth+rng.rand(len(data)), data, s=50)

barmean = np.mean(data)
barstd = np.std(data)
ew = 0.2
ax.bar([barpos-0.5*barwidth], [barmean], barwidth, color='#FFCC00')
eargs = {'color': 'k', 'lw': 2}
ax.plot([barpos, barpos], [barmean-barstd, barmean+barstd], **eargs)
ax.plot([barpos-0.5*ew, barpos+0.5*ew], [barmean-barstd, barmean-barstd], **eargs)
ax.plot([barpos-0.5*ew, barpos+0.5*ew], [barmean+barstd, barmean+barstd], **eargs)
ax.annotate('mean',
            xy=(barpos-0.4*barwidth, 2.7), xycoords='data',
            xytext=(barpos-1*barwidth, 5.5), textcoords='data', ha='left',
            arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
            connectionstyle="angle3,angleA=0,angleB=120") )
ax.annotate('mean plus\nstd. dev.',
            xy=(barpos+0.05*barwidth, 4.2), xycoords='data',
            xytext=(barpos-1*barwidth, 7.0), textcoords='data', ha='left',
            arrowprops=dict(arrowstyle="->", relpos=(0.5,0.0),
            connectionstyle="angle3,angleA=-60,angleB=80") )

ax = fig.add_axes([xpos, ypos, width, height], axis_bgcolor='none')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.set_xticklabels([])
ax.set_yticklabels([])
wh = ax.boxplot( data, positions=[boxpos], widths=[barwidth], whis=100.0, patch_artist=True)
wh['medians'][0].set_linewidth(4)
wh['whiskers'][0].set_linewidth(2)
wh['whiskers'][1].set_linewidth(2)
wh['whiskers'][0].set_linestyle('-')
wh['whiskers'][1].set_linestyle('-')
whiskercolor = 'k'
wh['whiskers'][0].set_color(whiskercolor)
wh['whiskers'][1].set_color(whiskercolor)
wh['caps'][0].set_color(whiskercolor)
wh['caps'][1].set_color(whiskercolor)
wh['boxes'][0].set_facecolor('#99ff00')
ax.set_xlim(0.0, 4.8)
ax.set_ylim( 0.0, 8.0)
ax.annotate('maximum',
            xy=(boxpos, 6.5), xycoords='data',
            xytext=(boxpos-1*barwidth, 7.6), textcoords='data', ha='left',
            arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
            connectionstyle="angle3,angleA=0,angleB=120") )
ax.annotate('3. quartile',
            xy=(boxpos-0.3*barwidth, 3.7), xycoords='data',
            xytext=(boxpos-1.3*barwidth, 5.5), textcoords='data', ha='left',
            arrowprops=dict(arrowstyle="->", relpos=(0.4,0.0),
            connectionstyle="angle3,angleA=0,angleB=120") )
ax.annotate('median',
            xy=(boxpos+0.6*barwidth, 2.2), xycoords='data',
            xytext=(boxpos+0.1*barwidth, 4.2), textcoords='data', ha='left',
            arrowprops=dict(arrowstyle="->", relpos=(0.8,0.0),
            connectionstyle="angle3,angleA=-60,angleB=20") )

ax = fig.add_axes([xpos+width+0.03, ypos, 0.98-(xpos+width+0.03), height], axis_bgcolor='none')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.set_yticklabels([])
ax.set_ylim( 0.0, 8.0)
ax.set_xticks(np.arange(0.0, 0.4, 0.1))
ax.set_xlabel('(4) p(x)')
bw = 0.75
bins = np.arange(0, 8.0+bw, bw)
h, b = np.histogram(data, bins)
ax.barh(b[:-1], h/bw/np.sum(h), bw, color='#CC0000')

plt.savefig('displayunivariatedata.pdf')
#plt.show()

View File

@ -7,7 +7,7 @@ x = np.arange( -3.0, 3.0, 0.01 )
g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
plt.xkcd()
fig = plt.figure( figsize=(6, 2.8) )
ax = fig.add_subplot(1, 2, 1)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
@ -72,6 +72,7 @@ ax.plot(x, g, 'b', lw=4)
ax.plot([m, m], [0.0, 0.38], 'k', lw=2 )
#ax.plot([gm, gm], [0.0, 0.42], 'k', lw=2 )
#plt.tight_layout()
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
fig.savefig( 'median.pdf' )
#plt.show()

View File

@ -30,6 +30,7 @@ ax.annotate('$P(0<x<1) = \int_0^1 p(x) \, dx$',
            connectionstyle="angle3,angleA=10,angleB=80") )
ax.fill_between( x[(x>x1)&(x<x2)], 0.0, g[(x>x1)&(x<x2)], color='#cc0000' )
ax.plot(x,g, 'b', lw=4)
#plt.tight_layout()
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
fig.savefig( 'pdfprobabilities.pdf' )
#plt.show()

View File

@ -18,6 +18,7 @@
\section{TODO}
\begin{itemize}
\item Replace exercise 1.3 (boxwhisker) by one recreating figure 1.
\item Proper introduction to probabilities and densities first!
\item Cumulative probability
\item Kernel histograms (important for convolved PSTH)!

View File

@ -4,8 +4,8 @@
Descriptive statistics characterizes data sets by means of a few measures.
In addition to histograms that estimate the full distribution of the data,
the following measures are used for characterizing univariate data:
\begin{description}
\item[Location, central tendency] (``Lagema{\ss}e''):
  arithmetic mean, median, mode.
@ -20,16 +20,23 @@ For bivariate and multivariate data sets we can also analyse their
  Spearman's rank correlation coefficient.
\end{description}

The following is not a complete introduction to descriptive
statistics, but summarizes a few concepts that are most important in
daily data-analysis problems.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Mean, variance, and standard deviation}

The \enterm{arithmetic mean} is a measure of location. For $n$ data values
$x_i$ the arithmetic mean is computed by
\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
This computation (summing up all elements of a vector and dividing by
the length of the vector) is provided by the function \mcode{mean()}.
The mean has the same unit as the data values.

The dispersion of the data values around the mean is quantified by
their \enterm{variance}
\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
The variance is computed by the function \mcode{var()}.
The unit of the variance is the unit of the data values squared.
Therefore, variances cannot be compared to the mean or the data values
themselves. In particular, variances cannot be used for plotting error
@ -37,13 +44,34 @@ bars along with the mean.
The standard deviation
\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
as computed by the function \mcode{std()}, however, has the same unit
as the data values and can (and should) be used to display the
dispersion of the data together with their mean.

The mean of a data set can be displayed by a bar plot
\matlabfun{bar()}. Additional error bars \matlabfun{errorbar()} can be
used to illustrate the standard deviation of the data
(\figref{displayunivariatedatafig} (2)).
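As a quick illustration of these three measures, a minimal Python/NumPy sketch (the data, seed, and variable names are made up for this example):

import numpy as np

rng = np.random.RandomState(42)
x = 4.0 + rng.randn(40)              # 40 hypothetical measurements

mean = np.mean(x)                    # arithmetic mean, same unit as the data
var = np.var(x)                      # variance, squared unit of the data
std = np.std(x)                      # standard deviation, same unit as the data

print(mean, var, std)
print(np.isclose(std, np.sqrt(var))) # sigma_x = sqrt(sigma_x^2) -> True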
\begin{figure}[t]
  \includegraphics[width=1\textwidth]{displayunivariatedata}
  \titlecaption{\label{displayunivariatedatafig} Displaying statistics
    of univariate data.}{(1) In particular for small data sets it is
    most informative to plot the data themselves. The value of each
    data point is plotted on the y-axis. To make the data points
    overlap less, they are jittered along the x-axis by means of
    uniformly distributed random numbers \matlabfun{rand()}. (2) With
    a bar plot \matlabfun{bar()} one usually shows the mean of the
    data. The additional error bar illustrates the deviation of the
    data from the mean by $\pm$ one standard deviation. (3) A
    box-whisker plot \matlabfun{boxplot()} shows more details of the
    distribution of the data values. The box extends from the first to
    the third quartile, a horizontal line within the box marks the median
    value, and the whiskers extend to the minimum and the maximum data
    values. (4) The probability density $p(x)$ estimated from a
    normalized histogram shows the entire distribution of the
    data. Estimating the probability distribution is only meaningful
    for sufficiently large data sets.}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@ -54,7 +82,7 @@ used to display the dispersion of the data together withtheir mean.
  \titlecaption{\label{medianfig} Median, mean and mode of a
    probability distribution.}{Left: Median, mean and mode are
    identical for the symmetric and unimodal normal distribution.
    Right: for asymmetric distributions these three measures differ. A
    heavy tail of a distribution pulls out the mean most strongly. In
    contrast, the median is more robust against heavy tails, but not
    necessarily identical with the mode.}
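To see the effect described in this caption numerically, a small Python sketch with an assumed right-skewed gamma sample (the distribution, seed, and bin count are illustrative only):

import numpy as np

rng = np.random.RandomState(0)
x = rng.gamma(1.0, 1.5, 10000)       # right-skewed data with a heavy tail

mean = np.mean(x)                    # pulled towards the tail
median = np.median(x)                # more robust against the tail
counts, edges = np.histogram(x, 50)
mode = 0.5*(edges[np.argmax(counts)] + edges[np.argmax(counts)+1])  # histogram peak

print(mean, median, mode)            # typically mean > median > mode here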
@ -66,7 +94,6 @@ The \enterm{median} separates a list of data values into two halves
such that one half of the data is not greater and the other half is
not smaller than the median (\figref{medianfig}).

\begin{exercise}{mymedian.m}{}
  Write a function \code{mymedian()} that computes the median of a vector.
\end{exercise}
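For orientation only (not the exercise's MATLAB solution), a Python sketch of how such a median function can work:

import numpy as np

def mymedian(x):
    """Middle value of the sorted data; mean of the two middle values for even n."""
    xs = np.sort(np.asarray(x))
    n = len(xs)
    if n % 2 == 1:
        return xs[n//2]
    return 0.5*(xs[n//2 - 1] + xs[n//2])

print(mymedian([3.0, 1.0, 2.0]))       # 2.0
print(mymedian([4.0, 1.0, 3.0, 2.0]))  # 2.5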
@ -77,7 +104,11 @@ not smaller than the median (\figref{medianfig}).
  Write a script that tests whether your median function really
  returns a median above which there are as many data values as
  below. In particular the script should test data vectors of
  different length. You should not use the \mcode{median()} function
  for testing your function.

  Writing tests for your own functions is a very important strategy for
  writing reliable code!
\end{exercise}
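One possible shape for such a test, sketched in Python (the helper name check_median and the loop bounds are made up; np.median only serves as a stand-in for your own function):

import numpy as np

def check_median(medianfunc, nmax=20, ntrials=100):
    """Check that medianfunc() leaves equally many values above and below."""
    rng = np.random.RandomState(1)
    for n in range(1, nmax + 1):
        for _ in range(ntrials):
            x = rng.randn(n)             # continuous data, ties are negligible
            m = medianfunc(x)
            if np.sum(x > m) != np.sum(x < m):
                return False
    return True

print(check_median(np.median))           # replace np.median by your mymedian()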
\begin{figure}[t]
@ -103,19 +134,19 @@ data are smaller than the 3$^{\rm rd}$ quartile.
% Write a function that computes the first, second, and third quartile of a vector.
% \end{exercise}

% \begin{figure}[t]
%   \includegraphics[width=1\textwidth]{boxwhisker}
%   \titlecaption{\label{boxwhiskerfig} Box-Whisker Plot.}{Box-whisker
%     plots are well suited for comparing unimodal distributions. Each
%     box-whisker characterizes 40 random numbers that have been drawn
%     from a normal distribution.}
% \end{figure}

\enterm{Box-whisker plots} are commonly used to visualize and compare
the distribution of unimodal data. A box is drawn around the median
that extends from the 1$^{\rm st}$ to the 3$^{\rm rd}$ quartile. The
whiskers mark the minimum and maximum value of the data set
(\figref{displayunivariatedatafig} (3)).

\begin{exercise}{boxwhisker.m}{}
  Generate a $40 \times 10$ matrix of random numbers and
@ -123,13 +154,17 @@ whiskers mark the minimum and maximum value of the data set
  (\code{boxplot()} function). How do you interpret the plot?
\end{exercise}
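A hedged Python counterpart of this exercise (matplotlib's boxplot draws one box-whisker per column of a matrix; seed and offset are arbitrary):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(2)
x = rng.randn(40, 10) + 4.0      # 10 data sets with 40 values each

fig, ax = plt.subplots()
ax.boxplot(x)                    # one box-whisker per column
ax.set_xlabel('data set')
ax.set_ylabel('x')
plt.show()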
\section{Distributions}

The distribution of values in a data set is estimated by histograms
(\figref{displayunivariatedatafig} (4)).

\subsection{Histograms}

\enterm[Histogram]{Histograms} count the frequency $n_i$ of
$N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$
(\figref{diehistogramsfig} left). The bins usually tile the data range
into intervals of the same size. The width of these intervals is
called the bin width.
\begin{figure}[t]
  \includegraphics[width=1\textwidth]{diehistograms}
@ -141,14 +176,33 @@ data values.
    with the expected theoretical distribution of $P=1/6$.}
\end{figure}
Histograms are often used to estimate the \enterm{probability
  distribution} of the data values.

\subsection{Probabilities}

In the frequentist interpretation of probability, the probability of
an event (e.g. getting a six when rolling a die) is the relative
occurrence of this event in the limit of a large number of trials.
For a finite number of trials $N$ in which the event $i$ occurred $n_i$
times, the probability $P_i$ of this event is estimated by
\[ P_i = \frac{n_i}{N} = \frac{n_i}{\sum_{i=1}^M n_i} \; . \]
From this definition it follows that a probability is a unitless
quantity that takes on values between zero and one. Most importantly,
the sum of the probabilities of all possible events is one:
\[ \sum_{i=1}^M P_i = \sum_{i=1}^M \frac{n_i}{N} = \frac{1}{N} \sum_{i=1}^M n_i = \frac{N}{N} = 1\; , \]
i.e. the probability of getting any of the possible events is one.

\subsection{Probability distributions of categorical data}

For categorical data values (e.g. the faces of a die, as integer
numbers or as colors) a bin can be defined for each category $i$.
The histogram is normalized by the total number of measurements to
make it independent of the size of the data set
(\figref{diehistogramsfig}). After this normalization the height of
each histogram bar is an estimate of the probability $P_i$ of the
category $i$, i.e. of getting a data value in the $i$-th bin.
\begin{exercise}{rollthedie.m}{}
  Write a function that simulates rolling a die $n$ times.
@ -162,38 +216,47 @@ the data value $x_i$ in the $i$-th bin:
\end{exercise}
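A minimal Python sketch of such a die simulation and the resulting probability estimates (function and variable names are made up; the number of rolls is arbitrary):

import numpy as np

def rollthedie(n, rng):
    """Simulate rolling a fair die n times."""
    return rng.randint(1, 7, n)

rng = np.random.RandomState(3)
rolls = rollthedie(10000, rng)

faces = np.arange(1, 7)
counts = np.array([np.sum(rolls == f) for f in faces])  # n_i for each face
probs = counts / len(rolls)                             # P_i = n_i / N
print(probs)                                            # each close to 1/6
print(np.sum(probs))                                    # probabilities sum to one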
\subsection{Probability density functions}

In cases where we deal with data sets of measurements of a real
quantity (e.g. the length of snakes, the weight of elephants, the time
between succeeding spikes) there is no natural bin width for computing
a histogram. In addition, the probability of measuring a data value that
equals exactly a specific real number like, e.g., 0.123456789 is zero, because
there are uncountably many real numbers.

We can only ask for the probability to get a measurement value in some
range. For example, we can ask for the probability $P(0<x<1)$ to
get a measurement between 0 and 1 (\figref{pdfprobabilitiesfig}). More
generally, we want to know the probability $P(x_0<x<x_1)$ to obtain a
measurement between $x_0$ and $x_1$. If we denote the width of this
range by $\Delta x = x_1 - x_0$, then the
probability can also be expressed as $P(x_0<x<x_0 + \Delta x)$.

In the limit of very small ranges $\Delta x$ the probability of
getting a measurement between $x_0$ and $x_0+\Delta x$ scales down to
zero with $\Delta x$:
\[ P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x \; . \]
Here the quantity $p(x_0)$ is a so-called \enterm{probability
  density}. This is not a unitless probability with values between 0
and 1, but a quantity that can take on any non-negative value and has as
a unit the inverse of the unit of the data values --- hence the name
``density''.
\begin{figure}[t]
  \includegraphics[width=1\textwidth]{pdfprobabilities}
  \titlecaption{\label{pdfprobabilitiesfig} Probability obtained from a
    probability density.}{The probability of a data value $x$ between,
    e.g., zero and one is the integral (red area) over the probability
    density (blue).}
\end{figure}
F\"ur beliebige Bereiche ist die Wahrscheinlichkeit f\"ur den Wert $x$ zwischen The probability to get a value $x$ between $x_1$ and $x_2$ is
$x_1$ und $x_2$ gegeben durch given by the integral over the probability density:
\[ P(x_1 < x < x2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \] \[ P(x_1 < x < x2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
Because the probability to get any value $x$ is one, the integral over
the probability density
Da die Wahrscheinlichkeit irgendeines Wertes $x$ Eins ergeben muss gilt die Normierung Da die Wahrscheinlichkeit irgendeines Wertes $x$ Eins ergeben muss gilt die Normierung
\begin{equation} \begin{equation}
\label{pdfnorm} \label{pdfnorm}
@ -215,35 +278,35 @@ Standardabweichung $\sigma$.
\begin{exercise}{gaussianpdf.m}{gaussianpdf.out}
  \begin{enumerate}
  \item Plot the probability density of the normal distribution $p_g(x)$.
  \item Compute the probability of getting a data value between zero and one
    for the normal distribution with zero mean and standard deviation of one.
  \item Draw 1000 normally distributed random numbers and use these
    numbers to calculate the probability of getting a number between
    zero and one.
  \item Compute from the normal distribution $\int_{-\infty}^{+\infty} p(x) \, dx$.
  \end{enumerate}
\end{exercise}
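A Python sketch along the lines of this exercise, approximating the integrals by sums over a fine grid (the grid spacing, range, and seed are arbitrary):

import numpy as np

dx = 0.001
x = np.arange(-5.0, 5.0, dx)
p = np.exp(-0.5*x**2)/np.sqrt(2.0*np.pi)      # standard normal density

P01 = np.sum(p[(x > 0.0) & (x < 1.0)])*dx     # P(0 < x < 1), about 0.34
print(P01)

rng = np.random.RandomState(4)
r = rng.randn(1000)                            # 1000 normally distributed numbers
print(np.sum((r > 0.0) & (r < 1.0))/len(r))    # same probability estimated from data

print(np.sum(p)*dx)                            # integral over the density, about 1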
\begin{figure}[t]
  \includegraphics[width=1\textwidth]{pdfhistogram}
  \titlecaption{\label{pdfhistogramfig} Histograms with different bin
    widths of normally distributed data.}{Left: The height of the
    histogram bars strongly depends on the width of the bins. Right:
    If the histogram is normalized such that its integral is one we
    get an estimate of the probability density of the data values.
    The normalized histograms are comparable with each other and can
    also be compared to theoretical probability densities, like the
    normal distribution (blue).}
\end{figure}
\begin{exercise}{gaussianbins.m}{}
  Draw 100 random data values from a Gaussian distribution and plot
  histograms of the data with different bin sizes. What do you
  observe?
\end{exercise}

\pagebreak[2]
In order to make histograms of real-valued measurements comparable
with each other despite different numbers of measurements and
different bin widths, and comparable with known
@ -262,7 +325,6 @@ und das normierte Histogramm hat die H\"ohe
Thus the histogram has to be divided not only by the sum of the counts but also by the width
$\Delta x$ of the bins (\figref{pdfhistogramfig}).
\begin{exercise}{gaussianbinsnorm.m}{}
  Normalize the histogram of the previous exercise to a probability density.
\end{exercise}
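A Python sketch of this normalization, dividing the counts by the total number of data values and by the bin width (bin width, range, and seed are arbitrary):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(5)
x = rng.randn(100)                              # 100 Gaussian random numbers

binwidth = 0.5
bins = np.arange(-4.0, 4.0 + binwidth, binwidth)
counts, edges = np.histogram(x, bins)
density = counts/np.sum(counts)/binwidth        # normalized to integral one

fig, ax = plt.subplots()
ax.bar(edges[:-1], density, binwidth, align='edge')
xx = np.arange(-4.0, 4.0, 0.01)
ax.plot(xx, np.exp(-0.5*xx**2)/np.sqrt(2.0*np.pi), lw=2)  # theoretical density
ax.set_xlabel('x')
ax.set_ylabel('p(x)')
plt.show()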