improved statistics chapter
This commit is contained in:
parent
f362788620
commit
12a417d6bc
17
header.tex
17
header.tex
@ -216,21 +216,30 @@
|
||||
%%%%% code/matlab commands: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\usepackage{textcomp}
|
||||
|
||||
% typeset code inline:
|
||||
\newcommand{\varcode}[1]{\setlength{\fboxsep}{0.5ex}\colorbox{codeback}{\texttt{#1\protect\rule[-0.1ex]{0pt}{1.6ex}}}}
|
||||
|
||||
\newcommand{\code}[2][]{\varcode{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[mcode]{#2}}{\protect\sindex[mcode]{#1}}}
|
||||
|
||||
% type set code and add it to the python index:
|
||||
\newcommand{\pcode}[2][]{\varcode{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[pcode]{#2}}{\protect\sindex[pcode]{#1}}}
|
||||
|
||||
% type set code and add it to the matlab index:
|
||||
\newcommand{\mcode}[2][]{\varcode{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[mcode]{#2}}{\protect\sindex[mcode]{#1}}}
|
||||
|
||||
% XXX typeset code and put it into matlab index:
|
||||
% THIS SHOULD actually take both the matlab and the python code!
|
||||
\newcommand{\code}[2][]{\varcode{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[mcode]{#2}}{\protect\sindex[mcode]{#1}}}
|
||||
|
||||
% the name of the python language:
|
||||
\newcommand{\python}{Python}
|
||||
|
||||
% the name of the matlab language:
|
||||
\newcommand{\matlab}{\texorpdfstring{MATLAB$^{\copyright}$}{MATLAB}}
|
||||
|
||||
\newcommand{\pythonfun}[2][]{(\tr{\python-function}{\python-Funktion} \setlength{\fboxsep}{0.5ex}\colorbox{codeback}{\texttt{#2}})\ifthenelse{\equal{#1}{}}{\protect\sindex[pcode]{#2}}{\protect\sindex[pcode]{#1}}}
|
||||
% typeset '(python-function #1)' and add the function to the python index:
|
||||
\newcommand{\pythonfun}[1]{(\tr{\python-function}{\python-Funktion} \varcode{#1})\protect\sindex[pcode]{#1}}
|
||||
|
||||
\newcommand{\matlabfun}[2][]{(\tr{\matlab-function}{\matlab-Funktion} \setlength{\fboxsep}{0.5ex}\colorbox{codeback}{\texttt{#2}})\ifthenelse{\equal{#1}{}}{\protect\sindex[mcode]{#2}}{\protect\sindex[mcode]{#1}}}
|
||||
% typeset '(matlab-function #1)' and add the function to the matlab index:
|
||||
\newcommand{\matlabfun}[1]{(\tr{\matlab-function}{\matlab-Funktion} \varcode{#1})\protect\sindex[mcode]{#1}}
|
||||
|
||||
|
||||
%%%%% shortquote and widequote commands: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
@ -1,54 +1,111 @@
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
rng = np.random.RandomState(981)
|
||||
x = rng.randn(40, 10) + 4.0
|
||||
#rng = np.random.RandomState(981)
|
||||
#data = rng.randn(40, 1) + 4.0
|
||||
rng = np.random.RandomState(1981)
|
||||
data = rng.gamma(1.0, 1.5, 40) + 1.0
|
||||
data = data[data<7.5]
|
||||
xpos = 0.08
|
||||
ypos = 0.15
|
||||
width = 0.65
|
||||
height = 0.8
|
||||
|
||||
plt.xkcd()
|
||||
fig = plt.figure( figsize=(6,3.4) )
|
||||
ax = fig.add_subplot(1, 1, 1)
|
||||
|
||||
ax = fig.add_axes([xpos, ypos, width, height])
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['top'].set_visible(False)
|
||||
ax.yaxis.set_ticks_position('left')
|
||||
ax.xaxis.set_ticks_position('bottom')
|
||||
ax.set_xlabel('Experiment')
|
||||
ax.set_xticklabels([])
|
||||
ax.set_xlim(0.0, 4.8)
|
||||
ax.set_ylabel('x')
|
||||
ax.set_ylim( 0.0, 8.0)
|
||||
ax.scatter(0.5+rng.rand(len(x[:, 5])), x[:, 5], s=50)
|
||||
ax.bar([2.0], [np.mean(x[:, 5])], 1.0, yerr=[np.std(x[:, 5])],
|
||||
ecolor='k', capsize=0, error_kw={'elinewidth':5})
|
||||
|
||||
## ax.annotate('Median',
|
||||
## xy=(3.9, 0.0), xycoords='data',
|
||||
## xytext=(3.5, -2.7), textcoords='data', ha='right',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(0.8,1.0),
|
||||
## connectionstyle="angle3,angleA=-110,angleB=60") )
|
||||
## ax.annotate('1. quartile',
|
||||
## xy=(5.8, -0.9), xycoords='data',
|
||||
## xytext=(5.5, -3.4), textcoords='data', ha='right',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
|
||||
## connectionstyle="angle3,angleA=30,angleB=70") )
|
||||
## ax.annotate('3. quartile',
|
||||
## xy=(6.1, 1.1), xycoords='data',
|
||||
## xytext=(6.5, 3.0), textcoords='data', ha='left',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0),
|
||||
## connectionstyle="angle3,angleA=30,angleB=70") )
|
||||
## ax.annotate('minimum',
|
||||
## xy=(6.1, -1.9), xycoords='data',
|
||||
## xytext=(7.2, -3.3), textcoords='data', ha='left',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5),
|
||||
## connectionstyle="angle3,angleA=10,angleB=100") )
|
||||
## ax.annotate('maximum',
|
||||
## xy=(5.9, 2.7), xycoords='data',
|
||||
## xytext=(4.9, 3.5), textcoords='data', ha='right',
|
||||
## arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
|
||||
## connectionstyle="angle3,angleA=0,angleB=120") )
|
||||
#ax.boxplot( x[:, 5], positions=[4.0], whis=100.0 )
|
||||
#ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0, manage_xticks=False )
|
||||
ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0 )
|
||||
ax.set_xlim(0.0, 5.0)
|
||||
ax.set_xticks([1, 3, 5], ['a', 'b', 'c'])
|
||||
plt.tight_layout()
|
||||
barwidth = 0.8
|
||||
|
||||
scatterpos = 1.0
|
||||
barpos = 2.5
|
||||
boxpos = 4.0
|
||||
|
||||
ax.set_xticks([scatterpos, barpos, boxpos])
|
||||
ax.set_xticklabels(['(1) data', '(2) bar\n plot', '(3) box-\nwhisker'])
|
||||
|
||||
ax.scatter(scatterpos-0.5*barwidth+rng.rand(len(data)), data, s=50)
|
||||
|
||||
barmean = np.mean(data)
|
||||
barstd = np.std(data)
|
||||
ew = 0.2
|
||||
ax.bar([barpos-0.5*barwidth], [barmean], barwidth, color='#FFCC00')
|
||||
eargs = {'color': 'k', 'lw': 2}
|
||||
ax.plot([barpos, barpos], [barmean-barstd, barmean+barstd], **eargs)
|
||||
ax.plot([barpos-0.5*ew, barpos+0.5*ew], [barmean-barstd, barmean-barstd], **eargs)
|
||||
ax.plot([barpos-0.5*ew, barpos+0.5*ew], [barmean+barstd, barmean+barstd], **eargs)
|
||||
ax.annotate('mean',
|
||||
xy=(barpos-0.4*barwidth, 2.7), xycoords='data',
|
||||
xytext=(barpos-1*barwidth, 5.5), textcoords='data', ha='left',
|
||||
arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
|
||||
connectionstyle="angle3,angleA=0,angleB=120") )
|
||||
ax.annotate('mean plus\nstd. dev.',
|
||||
xy=(barpos+0.05*barwidth, 4.2), xycoords='data',
|
||||
xytext=(barpos-1*barwidth, 7.0), textcoords='data', ha='left',
|
||||
arrowprops=dict(arrowstyle="->", relpos=(0.5,0.0),
|
||||
connectionstyle="angle3,angleA=-60,angleB=80") )
|
||||
|
||||
ax = fig.add_axes([xpos, ypos, width, height], axis_bgcolor='none')
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['top'].set_visible(False)
|
||||
ax.spines['left'].set_visible(False)
|
||||
ax.spines['bottom'].set_visible(False)
|
||||
ax.xaxis.set_ticks_position('none')
|
||||
ax.yaxis.set_ticks_position('none')
|
||||
ax.set_xticklabels([])
|
||||
ax.set_yticklabels([])
|
||||
wh = ax.boxplot( data, positions=[boxpos], widths=[barwidth], whis=100.0, patch_artist=True)
|
||||
wh['medians'][0].set_linewidth(4)
|
||||
wh['whiskers'][0].set_linewidth(2)
|
||||
wh['whiskers'][1].set_linewidth(2)
|
||||
wh['whiskers'][0].set_linestyle('-')
|
||||
wh['whiskers'][1].set_linestyle('-')
|
||||
whiskercolor = 'k'
|
||||
wh['whiskers'][0].set_color(whiskercolor)
|
||||
wh['whiskers'][1].set_color(whiskercolor)
|
||||
wh['caps'][0].set_color(whiskercolor)
|
||||
wh['caps'][1].set_color(whiskercolor)
|
||||
wh['boxes'][0].set_facecolor('#99ff00')
|
||||
ax.set_xlim(0.0, 4.8)
|
||||
ax.set_ylim( 0.0, 8.0)
|
||||
ax.annotate('maximum',
|
||||
xy=(boxpos, 6.5), xycoords='data',
|
||||
xytext=(boxpos-1*barwidth, 7.6), textcoords='data', ha='left',
|
||||
arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
|
||||
connectionstyle="angle3,angleA=0,angleB=120") )
|
||||
ax.annotate('3. quartile',
|
||||
xy=(boxpos-0.3*barwidth, 3.7), xycoords='data',
|
||||
xytext=(boxpos-1.3*barwidth, 5.5), textcoords='data', ha='left',
|
||||
arrowprops=dict(arrowstyle="->", relpos=(0.4,0.0),
|
||||
connectionstyle="angle3,angleA=0,angleB=120") )
|
||||
ax.annotate('median',
|
||||
xy=(boxpos+0.6*barwidth, 2.2), xycoords='data',
|
||||
xytext=(boxpos+0.1*barwidth, 4.2), textcoords='data', ha='left',
|
||||
arrowprops=dict(arrowstyle="->", relpos=(0.8,0.0),
|
||||
connectionstyle="angle3,angleA=-60,angleB=20") )
|
||||
|
||||
ax = fig.add_axes([xpos+width+0.03, ypos, 0.98-(xpos+width+0.03), height], axis_bgcolor='none')
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['top'].set_visible(False)
|
||||
ax.xaxis.set_ticks_position('bottom')
|
||||
ax.yaxis.set_ticks_position('left')
|
||||
ax.set_yticklabels([])
|
||||
ax.set_ylim( 0.0, 8.0)
|
||||
ax.set_xticks(np.arange(0.0, 0.4, 0.1))
|
||||
ax.set_xlabel('(4) p(x)')
|
||||
bw = 0.75
|
||||
bins = np.arange(0, 8.0+bw, bw)
|
||||
h, b = np.histogram(data, bins)
|
||||
ax.barh(b[:-1], h/bw/np.sum(h), bw, color='#CC0000')
|
||||
|
||||
plt.savefig('displayunivariatedata.pdf')
|
||||
#plt.show()
|
||||
|
||||
|
@ -7,7 +7,7 @@ x = np.arange( -3.0, 3.0, 0.01 )
|
||||
g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
|
||||
|
||||
plt.xkcd()
|
||||
fig = plt.figure( figsize=(6,3) )
|
||||
fig = plt.figure( figsize=(6, 2.8) )
|
||||
ax = fig.add_subplot(1, 2, 1)
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['top'].set_visible(False)
|
||||
@ -72,6 +72,7 @@ ax.plot(x, g, 'b', lw=4)
|
||||
ax.plot([m, m], [0.0, 0.38], 'k', lw=2 )
|
||||
#ax.plot([gm, gm], [0.0, 0.42], 'k', lw=2 )
|
||||
|
||||
plt.tight_layout()
|
||||
#plt.tight_layout()
|
||||
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
|
||||
fig.savefig( 'median.pdf' )
|
||||
#plt.show()
|
||||
|
@ -30,6 +30,7 @@ ax.annotate('$P(0<x<1) = \int_0^1 p(x) \, dx$',
|
||||
connectionstyle="angle3,angleA=10,angleB=80") )
|
||||
ax.fill_between( x[(x>x1)&(x<x2)], 0.0, g[(x>x1)&(x<x2)], color='#cc0000' )
|
||||
ax.plot(x,g, 'b', lw=4)
|
||||
plt.tight_layout()
|
||||
#plt.tight_layout()
|
||||
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
|
||||
fig.savefig( 'pdfprobabilities.pdf' )
|
||||
#plt.show()
|
||||
|
@ -18,6 +18,7 @@
|
||||
|
||||
\section{TODO}
|
||||
\begin{itemize}
|
||||
\item Replace exercise 1.3 (boxwhisker) by one recreating figure 1.
|
||||
\item Proper introduction to probabilities and densities first!
|
||||
\item Cumulative probability
|
||||
\item Kernel histograms (important for convolved PSTH)!
|
||||
|
@ -4,8 +4,8 @@
|
||||
|
||||
Descriptive statistics characterizes data sets by means of a few measures.
|
||||
|
||||
In addition to histograms that visualize the distribution of the data,
|
||||
the following measures are used for characterizing the univariate data:
|
||||
In addition to histograms that estimate the full distribution of the data,
|
||||
the following measures are used for characterizing univariate data:
|
||||
\begin{description}
|
||||
\item[Location, central tendency] (``Lagema{\ss}e''):
|
||||
arithmetic mean, median, mode.
|
||||
@ -20,30 +20,58 @@ For bivariate and multivariate data sets we can also analyse their
|
||||
Spearman's rank correlation coefficient.
|
||||
\end{description}
|
||||
|
||||
The following is not a complete introduction to descriptive
|
||||
statistics, but summarizes a few concepts that are most important in
|
||||
daily data-analysis problems.
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{Mean, variance, and standard deviation}
|
||||
The \enterm{arithmetic mean} is a measure of location. For $n$ data values
|
||||
$x_i$ the arithmetic mean is computed by
|
||||
\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
|
||||
This computation (summing up all elements of a vector and dividing by
|
||||
the length of the vector) is provided by the function \mcode{mean()}.
|
||||
The mean has the same unit as the data values.
|
||||
|
||||
The dispersion of the data values around the mean is quantified by
|
||||
their \enterm{variance}
|
||||
\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
|
||||
The variance is computed by the function \mcode{var()}.
|
||||
The unit of the variance is the unit of the data values squared.
|
||||
Therefore, variances cannot be compared to the mean or the data values
|
||||
themselves. In particular, variances cannot be used for plotting error
|
||||
bars along with the mean.
|
||||
|
||||
The standard deviation
|
||||
\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
|
||||
however, has the same unit as the data values and can (and should) be
|
||||
used to display the dispersion of the data together withtheir mean.
|
||||
\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
|
||||
as computed by the function \mcode{std()}, however, has the same unit
|
||||
as the data values and can (and should) be used to display the
|
||||
dispersion of the data together with their mean.
|
||||
|
||||
The mean of a data set can be displayed by a bar-plot
|
||||
\matlabfun{bar()}. Additional errorbars \matlabfun{errorbar()} can be
|
||||
used to illustrate the standard deviation of the data
|
||||
(\figref{displayunivariatedatafig} (2)).
|
||||
|
||||
\begin{figure}[t]
|
||||
\includegraphics[width=1\textwidth]{displayunivariatedata}
|
||||
\titlecaption{\label{displayunivariatefig} Display univariate
|
||||
data.}{Bla.}
|
||||
\titlecaption{\label{displayunivariatedatafig} Displaying statistics
|
||||
of univariate data.}{(1) In particular for small data sets it is
|
||||
most informative to plot the data themselves. The value of each
|
||||
data point is plotted on the y-axis. To make the data points
|
||||
overlap less, they are jittered along the x-axis by means of
|
||||
uniformly distributed random numbers \matlabfun{rand()}. (2) With
|
||||
a bar plot \matlabfun{bar()} one usually shows the mean of the
|
||||
data. The additional errorbar illustrates the deviation of the
|
||||
data from the mean by $\pm$ one standard deviation. (3) A
|
||||
box-whisker plot \matlabfun{boxplot()} shows more details of the
|
||||
distribution of the data values. The box extends from the 1. to
|
||||
the 3. quartile, a horizontal line within the box marks the median
|
||||
value, and the whiskers extend to the minimum and the maximum data
|
||||
values. (4) The probability density $p(x)$ estimated from a
|
||||
normalized histogram shows the entire distribution of the
|
||||
data. Estimating the probability distribution is only meaningful
|
||||
for sufficiently large data sets.}
|
||||
\end{figure}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
@ -54,7 +82,7 @@ used to display the dispersion of the data together withtheir mean.
|
||||
\titlecaption{\label{medianfig} Median, mean and mode of a
|
||||
probability distribution.}{Left: Median, mean and mode are
|
||||
identical for the symmetric and unimodal normal distribution.
|
||||
Right: for asymmetric distributions these threa measures differ. A
|
||||
Right: for asymmetric distributions these three measures differ. A
|
||||
heavy tail of a distribution pulls out the mean most strongly. In
|
||||
contrast, the median is more robust against heavy tails, but not
|
||||
necessarily identical with the mode.}
|
||||
@ -66,7 +94,6 @@ The \enterm{median} separates a list of data values into two halves
|
||||
such that one half of the data is not greater and the other half is
|
||||
not smaller than the median (\figref{medianfig}).
|
||||
|
||||
\newpage
|
||||
\begin{exercise}{mymedian.m}{}
|
||||
Write a function \code{mymedian()} that computes the median of a vector.
|
||||
\end{exercise}
|
||||
@ -77,7 +104,11 @@ not smaller than the median (\figref{medianfig}).
|
||||
Write a script that tests whether your median function really
|
||||
returns a median with the same number of data values above as
|
||||
below. In particular the script should test data vectors of
|
||||
different length.
|
||||
different length. You should not use the \mcode{median()} function
|
||||
for testing your function.
|
||||
|
||||
Writing tests for your own functions is a very important strategy for
|
||||
writing reliable code!
|
||||
\end{exercise}
|
||||
|
||||
\begin{figure}[t]
|
||||
@ -103,19 +134,19 @@ data are smaller than the 3$^{\rm rd}$ quartile.
|
||||
% Write a function that computes the first, second, and third quartile of a vector.
|
||||
% \end{exercise}
|
||||
|
||||
\begin{figure}[t]
|
||||
\includegraphics[width=1\textwidth]{boxwhisker}
|
||||
\titlecaption{\label{boxwhiskerfig} Box-Whisker Plot.}{Box-whisker
|
||||
plots are well suited for comparing unimodal distributions. Each
|
||||
box-whisker characterizes 40 random numbers that have been drawn
|
||||
from a normal distribution.}
|
||||
\end{figure}
|
||||
% \begin{figure}[t]
|
||||
% \includegraphics[width=1\textwidth]{boxwhisker}
|
||||
% \titlecaption{\label{boxwhiskerfig} Box-Whisker Plot.}{Box-whisker
|
||||
% plots are well suited for comparing unimodal distributions. Each
|
||||
% box-whisker characterizes 40 random numbers that have been drawn
|
||||
% from a normal distribution.}
|
||||
% \end{figure}
|
||||
|
||||
\enterm{Box-whisker plots} are commonly used to visualize and compare
|
||||
the distribution of unimodal data. Aa box is drawn around the median
|
||||
the distribution of unimodal data. A box is drawn around the median
|
||||
that extends from the 1$^{\rm st}$ to the 3$^{\rm rd}$ quartile. The
|
||||
whiskers mark the minimum and maximum value of the data set
|
||||
(\figref{boxwhiskerfig}).
|
||||
(\figref{displayunivariatedatafig} (3)).
|
||||
|
||||
\begin{exercise}{boxwhisker.m}{}
|
||||
Generate a $40 \times 10$ matrix of random numbers and
|
||||
@ -123,13 +154,17 @@ whiskers mark the minimum and maximum value of the data set
|
||||
(\code{boxplot()} function). How to interpret the plot?
|
||||
\end{exercise}
|
||||
|
||||
\section{Histograms}
|
||||
\section{Distributions}
|
||||
The distribution of values in a data set is estimated by histograms
|
||||
(\figref{displayunivariatedatafig} (4)).
|
||||
|
||||
\subsection{Histograms}
|
||||
|
||||
\enterm[Histogram]{Histograms} count the frequency $n_i$ of
|
||||
$N=\sum_{i=1}^M n_i$ measurements in $M$ bins $i$. The bins tile the
|
||||
data range usually into intervals of the same size. Histograms are
|
||||
often used to estimate the \enterm{probability distribution} of the
|
||||
data values.
|
||||
$N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$
|
||||
(\figref{diehistogramsfig} left). The bins tile the data range
|
||||
usually into intervals of the same size. The width of the bins is
|
||||
called the bin width.
|
||||
|
||||
\begin{figure}[t]
|
||||
\includegraphics[width=1\textwidth]{diehistograms}
|
||||
@ -141,14 +176,33 @@ data values.
|
||||
with the expected theoretical distribution of $P=1/6$.}
|
||||
\end{figure}
|
||||
|
||||
For integer data values (e.g. die number of the faces of a die or the
|
||||
number of action potential occurring within a fixed time window) a bin
|
||||
can be defined for each data value. The histogram is usually
|
||||
normalized by the total number of measurements to make it
|
||||
independent of size of the data set (\figref{diehistogramsfig}). Then
|
||||
the height of each histogram bar equals the probability $P(x_i)$ of
|
||||
the data value $x_i$ in the $i$-th bin:
|
||||
\[ P(x_i) = P_i = \frac{n_i}{N} = \frac{n_i}{\sum_{i=1}^M n_i} \; . \]
|
||||
Histograms are often used to estimate the \enterm{probability
|
||||
distribution} of the data values.
|
||||
|
||||
\subsection{Probabilities}
|
||||
In the frequentist interpretation of probability, the probability of
|
||||
an event (e.g. getting a six when rolling a die) is the relative
|
||||
occurrence of this event in the limit of a large number of trials.
|
||||
|
||||
For a finite number of trials $N$ where the event $i$ occurred $n_i$
|
||||
times, the probability $P_i$ of this event is estimated by
|
||||
\[ P_i = \frac{n_i}{N} = \frac{n_i}{\sum_{i=1}^M n_i} \; . \]
|
||||
From this definition it follows that a probability is a unitless
|
||||
quantity that takes on values between zero and one. Most importantly,
|
||||
the sum of the probabilities of all possible events is one:
|
||||
\[ \sum_{i=1}^M P_i = \sum_{i=1}^M \frac{n_i}{N} = \frac{1}{N} \sum_{i=1}^M n_i = \frac{N}{N} = 1\; , \]
|
||||
i.e. the probability of getting any event is one.
|
||||
|
||||
|
||||
\subsection{Probability distributions of categorical data}
|
||||
|
||||
For categorical data values (e.g. the faces of a die (as integer
|
||||
numbers or as colors)) a bin can be defined for each category $i$.
|
||||
The histogram is normalized by the total number of measurements to
|
||||
make it independent of the size of the data set
|
||||
(\figref{diehistogramsfig}). After this normalization the height of
|
||||
each histogram bar is an estimate of the probability $P_i$ of the
|
||||
category $i$, i.e. of getting a data value in the $i$-th bin.
|
||||
|
||||
\begin{exercise}{rollthedie.m}{}
|
||||
Write a function that simulates rolling a die $n$ times.
|
||||
@ -162,38 +216,47 @@ the data value $x_i$ in the $i$-th bin:
|
||||
\end{exercise}
|
||||
|
||||
|
||||
\section{Probability density functions}
|
||||
|
||||
Meistens haben wir es jedoch mit reellen Messgr\"o{\ss}en zu tun
|
||||
(z.B. Gewicht von Tigern, L\"ange von Interspikeintervallen). Es
|
||||
macht keinen Sinn dem Auftreten jeder einzelnen reelen Zahl eine
|
||||
Wahrscheinlichkeit zuzuordnen, denn die Wahrscheinlichkeit genau den
|
||||
Wert einer bestimmten reelen Zahl, z.B. 1.23456789, zu messen ist
|
||||
gleich Null, da es unabz\"ahlbar viele reelle Zahlen gibt.
|
||||
|
||||
Sinnvoller ist es dagegen, nach der Wahrscheinlichkeit zu fragen, eine
|
||||
Zahl aus einem bestimmten Bereich zu erhalten, z.B. die
|
||||
Wahrscheinlichkeit $P(1.2<x<1.3)$, dass die Zahl $x$ einen Wert
|
||||
zwischen 1.2 und 1.3 hat.
|
||||
|
||||
Im Grenzwert zu sehr kleinen Bereichen $\Delta x$ ist die Wahrscheinlichkeit
|
||||
eines Wertes $x$ zwischen $x_0$ und $x_0+\Delta x$
|
||||
\[ P(x_0<x<x_0+\Delta x) \approx p(x) \cdot \Delta x \; . \]
|
||||
Die Gr\"o{\ss}e $p(x)$ ist eine sogenannte
|
||||
\determ{Wahrscheinlichkeitsdichte}. Sie ist keine einheitenlose
|
||||
Wahrscheinlichkeit mit Werten zwischen Null und Eins, sondern kann
|
||||
jeden positiven Wert annehmen und hat als Einheit den Kehrwert der
|
||||
Einheit von $x$.
|
||||
\subsection{Probability density functions}
|
||||
|
||||
In cases where we deal with data sets of measurements of a real
|
||||
quantity (e.g. the length of snakes, the weight of elephants, the time
|
||||
between succeeding spikes) there is no natural bin width for computing
|
||||
a histogram. In addition, the probability of measuring a data value that
|
||||
equals exactly a specific real number like, e.g., 0.123456789 is zero, because
|
||||
there are uncountably many real numbers.
|
||||
|
||||
We can only ask for the probability to get a measurement value in some
|
||||
range. For example, we can ask for the probability $P(0<x<1)$ to
|
||||
get a measurement between 0 and 1 (\figref{pdfprobabilitiesfig}). More
|
||||
generally, we want to know the probability $P(x_0<x<x_1)$ to obtain a
|
||||
measurement between $x_0$ and $x_1$. If we define the width of the
|
||||
range defined by $x_0$ and $x_1$ as $\Delta x = x_1 - x_0$ then the
|
||||
probability can also be expressed as $P(x_0<x<x_0 + \Delta x)$.
|
||||
|
||||
In the limit to very small ranges $\Delta x$ the probability of
|
||||
getting a measurement between $x_0$ and $x_0+\Delta x$ scales down to
|
||||
zero with $\Delta x$:
|
||||
\[ P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x \; . \]
|
||||
Here the quantity $p(x_0)$ is a so-called \enterm{probability
|
||||
density}. This is not a unitless probability with values between 0
|
||||
and 1, but a number that takes on any positive real number and has as
|
||||
a unit the inverse of the unit of the data values --- hence the name
|
||||
``density''.
|
||||
|
||||
\begin{figure}[t]
|
||||
\includegraphics[width=1\textwidth]{pdfprobabilities}
|
||||
\titlecaption{\label{pdfprobabilitiesfig} Wahrscheinlichkeiten bei
|
||||
einer Wahrscheinlichkeitsdichtefunktion.}{}
|
||||
\titlecaption{\label{pdfprobabilitiesfig} Probability of a
|
||||
probability density.}{The probability of a data value $x$ between,
|
||||
e.g., zero and one is the integral (red area) over the probability
|
||||
density (blue).}
|
||||
\end{figure}
|
||||
|
||||
F\"ur beliebige Bereiche ist die Wahrscheinlichkeit f\"ur den Wert $x$ zwischen
|
||||
$x_1$ und $x_2$ gegeben durch
|
||||
The probability to get a value $x$ between $x_1$ and $x_2$ is
|
||||
given by the integral over the probability density:
|
||||
\[ P(x_1 < x < x_2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
|
||||
Because the probability to get any value $x$ is one, the integral over
|
||||
the probability density
|
||||
|
||||
Da die Wahrscheinlichkeit irgendeines Wertes $x$ Eins ergeben muss gilt die Normierung
|
||||
\begin{equation}
|
||||
\label{pdfnorm}
|
||||
@ -215,35 +278,35 @@ Standardabweichung $\sigma$.
|
||||
|
||||
\begin{exercise}{gaussianpdf.m}{gaussianpdf.out}
|
||||
\begin{enumerate}
|
||||
\item Plotte die Wahrscheinlichkeitsdichte der Normalverteilung $p_g(x)$.
|
||||
\item Berechne f\"ur die Normalverteilung mit Mittelwert Null und
|
||||
Standardabweichung Eins die Wahrscheinlichkeit, eine Zahl zwischen
|
||||
0 und 1 zu erhalten.
|
||||
\item Ziehe 1000 normalverteilte Zufallszahlen und bestimme von
|
||||
diesen Zufallzahlen die Wahrscheinlichkeit der Zahlen zwischen
|
||||
Null und Eins.
|
||||
\item Berechne aus der Normalverteilung $\int_{-\infty}^{+\infty} p(x) \, dx$.
|
||||
\item Plot the probability density of the normal distribution $p_g(x)$.
|
||||
\item Compute the probability of getting a data value between zero and one
|
||||
for the normal distribution with zero mean and standard deviation of one.
|
||||
\item Draw 1000 normally distributed random numbers and use these
|
||||
numbers to calculate the probability of getting a number between
|
||||
zero and one.
|
||||
\item Compute from the normal distribution $\int_{-\infty}^{+\infty} p(x) \, dx$.
|
||||
\end{enumerate}
|
||||
\end{exercise}
|
||||
|
||||
\begin{figure}[t]
|
||||
\includegraphics[width=1\textwidth]{pdfhistogram}
|
||||
\titlecaption{\label{pdfhistogramfig} Histogramme mit verschiedenen
|
||||
Klassenbreiten von normalverteilten Messwerten.}{Links: Die H\"ohe
|
||||
des absoluten Histogramms h\"angt von der Klassenbreite
|
||||
ab. Rechts: Bei auf das Integral normierten Histogrammen werden
|
||||
auch unterschiedliche Klassenbreiten untereinander vergleichbar
|
||||
und auch mit der theoretischen Wahrschinlichkeitsdichtefunktion
|
||||
(blau).}
|
||||
\titlecaption{\label{pdfhistogramfig} Histograms with different bin
|
||||
widths of normally distributed data.}{Left: The height of the
|
||||
histogram bars strongly depends on the width of the bins. Right:
|
||||
If the histogram is normalized such that its integral is one we
|
||||
get an estimate of the probability density of the data values.
|
||||
The normalized histograms are comparable with each other and can
|
||||
also be compared to theoretical probability densities, like the
|
||||
normal distributions (blue).}
|
||||
\end{figure}
|
||||
|
||||
\pagebreak[4]
|
||||
\begin{exercise}{gaussianbins.m}{}
|
||||
Draw 100 random data from a Gaussian distribution and plot
|
||||
histograms with different bin sizes of the data. What do you
|
||||
observe?
|
||||
\end{exercise}
|
||||
|
||||
\pagebreak[2]
|
||||
Damit Histogramme von reellen Messwerten trotz unterschiedlicher
|
||||
Anzahl von Messungen und unterschiedlicher Klassenbreiten
|
||||
untereinander vergleichbar werden und mit bekannten
|
||||
@ -262,7 +325,6 @@ und das normierte Histogramm hat die H\"ohe
|
||||
Es muss also nicht nur durch die Summe, sondern auch durch die Breite
|
||||
$\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
|
||||
|
||||
\pagebreak[4]
|
||||
\begin{exercise}{gaussianbinsnorm.m}{}
|
||||
Normiere das Histogramm der vorherigen \"Ubung zu einer Wahrscheinlichkeitsdichte.
|
||||
\end{exercise}
|
||||
|
Reference in New Issue
Block a user