added section on cumulative densities

Jan Benda 2017-11-25 18:46:03 +01:00
parent 12a417d6bc
commit 6c95ec7256
13 changed files with 380 additions and 75 deletions

View File

@ -1,6 +1,7 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{\tr{Bootstrap Methods}{Bootstrap Methoden}}
\label{bootstrapchapter}
With the \determ{Bootstrap} the distribution of a statistic is
generated by resampling from the sample itself. This has several advantages:

View File

@ -46,7 +46,7 @@
\include{programmingstyle/lecture/programmingstyle}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\part{Data analysis}
\graphicspath{{statistics/lecture/}{statistics/lecture/figures/}}
\lstset{inputpath=statistics/code}

View File

@ -0,0 +1,18 @@
x = randn(200, 1);                     % generate some data
xs = sort(x);                          % sort the data
cdf = (1:length(x))/length(x);         % empirical cumulative distribution
plot(xs, cdf);
hold on;
dx = 0.01;                             % integration step width
xx = -4:dx:4;                          % x-values for Gaussian pdf
gauss = exp(-0.5*xx.^2)/sqrt(2.0*pi);  % Gaussian pdf
gausscdf = cumsum(gauss)*dx;           % integrate to get the Gaussian cdf
plot(xx, gausscdf);
hold off;
% chained indexing like cdf(xs<-1.0)(end) is Octave-only;
% use intermediate variables for MATLAB compatibility:
dp = cdf(xs < -1.0);
fprintf('data : probability of x<-1: %.2f\n', dp(end))
gp = gausscdf(xx < -1.0);
fprintf('gauss: probability of x<-1: %.2f\n', gp(end))
fprintf('\n')
dq = xs(cdf < 0.05);
fprintf('data : 5%% percentile at %.2f\n', dq(end))
gq = xx(gausscdf < 0.05);
fprintf('gauss: 5%% percentile at %.2f\n', gq(end))

View File

@ -1,11 +1,11 @@
x = randn(100, 1); % generate some data
db1 = 2; % large bin width
db2 = 0.5; % small bin width
bins1 = -4:db1:4; % large bins
bins2 = -4:db2:4; % small bins
[h1,b1] = hist(x, bins1);
[h2,b2] = hist(x, bins2);
subplot( 1, 2, 1 );
bar(b1, h1)
hold on
bar(b2, h2, 'facecolor', 'r' )

View File

@ -1,6 +1,6 @@
hn1 = h1/sum(h1)/db1;
hn2 = h2/sum(h2)/db2;
subplot( 1, 2, 2 )
bar(b1, hn1)
hold on
bar(b2, hn2, 'facecolor', 'r' )

View File

@ -0,0 +1,42 @@
data = randn(100, 1); % generate some data
sigma = 0.2; % standard deviation of Gaussian kernel
xmin = -4.0; % minimum x value for kernel density
xmax = 4.0; % maximum x value for kernel density
dx = 0.05*sigma; % step size for kernel density
xg = [-4.0*sigma:dx:4.0*sigma]; % x-axis for single Gaussian kernel
% single Gaussian kernel:
kernel = exp(-0.5*(xg/sigma).^2)/sqrt(2.0*pi)/sigma;
ng = (length(kernel)-1)/2; % half the length of the Gaussian
x = [xmin:dx:xmax+0.5*dx]; % x-axis for kernel density
kd = zeros(1, length(x)); % vector for kernel density
for i = 1:length(data)              % for every data value ...
    xd = data(i);
    % index of data value in kernel density vector:
    inx = round((xd-xmin)/dx)+1;
    % start index for Gaussian in kernel density vector:
    k0 = inx-ng;
    % end index for Gaussian in kernel density vector:
    k1 = inx+ng;
    g0 = 1;                         % start index in Gaussian
    g1 = length(kernel);            % end index in Gaussian
    % check whether left side of Gaussian extends below xmin:
    if inx < ng+1
        % adjust start indices accordingly
        % (+2: kernel center kernel(ng+1) must align with kd(inx)):
        k0 = 1;
        g0 = ng-inx+2;
    end
    % check whether right side of Gaussian extends above xmax:
    if inx > length(kd)-ng
        % adjust end indices accordingly:
        k1 = length(kd);
        g1 = length(kernel)-(inx+ng-length(kd));
    end
    % add Gaussian onto kernel density:
    kd(k0:k1) = kd(k0:k1) + kernel(g0:g1);
end
kd = kd/length(data);               % normalize by number of data points
% plot kernel density:
plot(x, kd)
xlabel('x')
ylabel('Probability density')

View File

@ -0,0 +1,52 @@
import numpy as np
import matplotlib.pyplot as plt
# data:
rng = np.random.RandomState(981)
data = rng.randn(100)
xs = np.sort(data)
cdf = np.arange(len(xs))/float(len(xs))
# Gauss:
dx = 0.01
xx = np.arange(-4.0, 4.0, dx)
gauss = np.exp(-0.5*xx*xx)/np.sqrt(2.0*np.pi)
gausscdf = np.cumsum(gauss)*dx
# plot:
plt.xkcd()
fig = plt.figure( figsize=(6, 2.6) )
ax = fig.add_subplot(1, 1, 1)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel( 'x' )
ax.set_xlim(-3.2, 3.2)
ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
ax.set_ylabel( 'F(x)' )
ax.set_ylim(-0.05, 1.05)
ax.set_yticks( np.arange( 0.0, 1.1, 0.2 ) )
med = xs[cdf>=0.5][0]
ax.plot([-3.2, med, med], [0.5, 0.5, 0.0], 'k', lw=1, zorder=-5)
ax.text(-2.8, 0.55, 'F=0.5')
ax.text(0.15, 0.25, 'median at %.2f' % med)
q3 = xs[cdf>=0.75][0]
ax.plot([-3.2, q3, q3], [0.75, 0.75, 0.0], 'k', lw=1, zorder=-5)
ax.text(-2.8, 0.8, 'F=0.75')
ax.text(0.8, 0.5, '3. quartile at %.2f' % q3)
p = cdf[xs>=-1.0][0]
ax.plot([-3.2, -1.0, -1.0], [p, p, 0.0], 'k', lw=1, zorder=-5)
ax.text(-2.8, 0.2, 'F=%.2f' % p)
ax.text(-0.9, 0.05, '-1')
ax.plot(xx, gausscdf, '-', color='#0000ff', lw=2, zorder=-1)
ax.plot(xs, cdf, '-', color='#cc0000', lw=4, zorder=-1)
ax.plot([-3.2, 3.2], [1.0, 1.0], '--', color='k', lw=2, zorder=-10)
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3)
fig.savefig( 'cumulative.pdf' )
#plt.show()

View File

@ -34,6 +34,6 @@ ax.set_ylim(0, 0.23)
ax.set_ylabel( 'Probability' )
ax.plot([0.2, 6.8], [1.0/6.0, 1.0/6.0], '-b', lw=2, zorder=1)
ax.hist([x2, x1], bins, normed=True, color=['#FFCC00', '#FFFF66' ], zorder=10)
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
fig.savefig( 'diehistograms.pdf' )
#plt.show()

View File

@ -0,0 +1,83 @@
import numpy as np
import matplotlib.pyplot as plt
# normal distribution:
rng = np.random.RandomState(6281)
x = np.arange( -4.0, 4.0, 0.01 )
g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
r = rng.randn(100)
def kerneldensity(data, xmin, xmax, sigma=1.0):
    # estimate a kernel density with Gaussian kernels of width sigma:
    dx = 0.05*sigma
    xg = np.arange(-4.0*sigma, 4.0*sigma + 0.5*dx, dx)
    gauss = np.exp(-0.5*xg*xg/sigma/sigma)/np.sqrt(2.0*np.pi)/sigma
    ng = len(gauss)//2     # half the length of the Gaussian kernel
    x = np.arange(xmin, xmax+0.5*dx, dx)
    kd = np.zeros(len(x))
    for xd in data:
        # index of the data value in the kernel density array:
        inx = int((xd-xmin)/dx)
        k0 = inx-ng            # start index in the kernel density array
        k1 = inx+ng+1          # end index in the kernel density array
        g0 = 0                 # start index in the Gaussian
        g1 = len(gauss)        # end index in the Gaussian
        if inx < ng:           # kernel extends below xmin:
            k0 = 0
            g0 = ng-inx
        if inx >= len(kd)-ng:  # kernel extends above xmax:
            k1 = len(kd)
            g1 = len(gauss)-(inx+ng-len(kd)+1)
        kd[k0:k1] += gauss[g0:g1]
    kd /= len(data)
    return kd, x
plt.xkcd()
fig = plt.figure( figsize=(6,3) )
ax = fig.add_subplot(2, 2, 1)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel( 'x' )
ax.set_xlim(-3.2, 3.2)
ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
ax.set_ylabel( 'p(x)' )
ax.set_ylim(0.0, 0.49)
ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
#ax.plot(x, g, '-b', lw=2, zorder=-1)
ax.hist(r, np.arange(-4.1, 4, 0.4), normed=True, color='#FFCC00', zorder=-5)
ax = fig.add_subplot(2, 2, 3)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel( 'x' )
ax.set_xlim(-3.2, 3.2)
ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
ax.set_ylabel( 'p(x)' )
ax.set_ylim(0.0, 0.49)
ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
#ax.plot(x, g, '-b', lw=2, zorder=-1)
ax.hist(r, np.arange(-4.3, 4, 0.4), normed=True, color='#FFCC00', zorder=-5)
ax = fig.add_subplot(1, 2, 2)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel( 'x' )
ax.set_xlim(-3.2, 3.2)
ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
ax.set_ylabel( 'Probab. density p(x)' )
ax.set_ylim(0.0, 0.49)
ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
kd, xx = kerneldensity(r, -3.2, 3.2, 0.2)
ax.fill_between(xx, 0.0, kd, color='#FF9900', zorder=-5)
ax.plot(xx, kd, '-', lw=3, color='#CC0000', zorder=-1)
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3)
fig.savefig( 'kerneldensity.pdf' )
#plt.show()

View File

@ -38,7 +38,7 @@ ax.plot(x, g, '-b', lw=2, zorder=-1)
ax.hist(r, 5, normed=True, color='#CC0000', zorder=-10)
ax.hist(r, 20, normed=True, color='#FFCC00', zorder=-5)
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
fig.savefig( 'pdfhistogram.pdf' )
#plt.show()

View File

@ -7,7 +7,7 @@ g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
q = [ -0.67488, 0.0, 0.67488 ]
plt.xkcd()
fig = plt.figure( figsize=(6,3.2) )
ax = fig.add_subplot( 1, 1, 1 )
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
@ -44,6 +44,7 @@ ax.plot(x,g, 'b', lw=4)
ax.plot([0.0, 0.0], [0.0, 0.45], 'k', lw=2 )
ax.plot([q[0], q[0]], [0.0, 0.4], 'k', lw=2 )
ax.plot([q[2], q[2]], [0.0, 0.4], 'k', lw=2 )
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
#plt.tight_layout()
fig.savefig( 'quartile.pdf' )
#plt.show()

View File

@ -19,9 +19,6 @@
\section{TODO}
\begin{itemize}
\item Replace exercise 1.3 (boxwhisker) by one recreating figure 1.
\end{itemize}
\end{document}

View File

@ -80,12 +80,12 @@ used to illustrate the standard deviation of the data
\begin{figure}[t]
\includegraphics[width=1\textwidth]{median}
\titlecaption{\label{medianfig} Median, mean and mode of a
probability distribution.}{Left: Median, mean and mode coincide
for the symmetric and unimodal normal distribution. Right: for
asymmetric distributions these three measures differ. A heavy tail
of a distribution pulls out the mean most strongly. In contrast,
the median is more robust against heavy tails, but not necessarily
identical with the mode.}
\end{figure}
The \enterm{mode} is the most frequent value, i.e. the position of the maximum of the probability distribution.
@ -113,7 +113,10 @@ not smaller than the median (\figref{medianfig}).
\begin{figure}[t]
\includegraphics[width=1\textwidth]{quartile}
\titlecaption{\label{quartilefig} Median and quartiles of a normal
distribution.}{The interquartile range between the first and the
third quartile contains 50\,\% of the data and includes the
median.}
\end{figure}
The distribution of data can be further characterized by the position
@ -164,7 +167,9 @@ The distribution of values in a data set is estimated by histograms
$N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$
(\figref{diehistogramsfig} left). The bins tile the data range
usually into intervals of the same size. The width of the bins is
called the bin width. The frequencies $n_i$ plotted against the
categories $i$ constitute the \enterm{histogram}, or \enterm{frequency
histogram}.
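Such a frequency histogram can be computed in a few lines (a minimal
sketch, simulating a fair die):
\begin{lstlisting}
rolls = randi(6, 100, 1);   % simulate 100 rolls of a fair die
bins = 1:6;                 % one bin per category
counts = hist(rolls, bins); % frequencies n_i in each bin
bar(bins, counts);          % plot the frequency histogram
\end{lstlisting}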
\begin{figure}[t]
\includegraphics[width=1\textwidth]{diehistograms}
@ -219,7 +224,7 @@ category $i$, i.e. of getting a data value in the $i$-th bin.
\subsection{Probability density functions}
In cases where we deal with data sets of measurements of a real
quantity (e.g. lengths of snakes, weights of elephants, times
between successive spikes) there is no natural bin width for computing
a histogram. In addition, the probability of measuring a data value that
equals exactly a specific real number like, e.g., 0.123456789 is zero, because
range. For example, we can ask for the probability $P(0<x<1)$ of
getting a measurement between 0 and 1 (\figref{pdfprobabilitiesfig}). More
generally, we want to know the probability $P(x_0<x<x_1)$ of obtaining a
measurement between $x_0$ and $x_1$. If we define the width of the
range between $x_0$ and $x_1$ as $\Delta x = x_1 - x_0$ then the
probability can also be expressed as $P(x_0<x<x_0 + \Delta x)$.
In the limit of very small ranges $\Delta x$ the probability of
@ -238,44 +243,45 @@ getting a measurement between $x_0$ and $x_0+\Delta x$ scales down to
zero with $\Delta x$:
\[ P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x \; . \]
Here the quantity $p(x_0)$ is a so-called \enterm{probability
density} that is larger than or equal to zero and that describes the
distribution of the data values. The probability density is not a
unitless probability with values between 0 and 1, but a number that
can take on any non-negative real value and has as a unit the inverse
of the unit of the data values --- hence the name ``density''.
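The relation $P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x$ can
be illustrated numerically (a minimal sketch using simulated standard
normal data; the chosen $x_0$ and $\Delta x$ are example values):
\begin{lstlisting}
x = randn(10000, 1);              % many standard normally distributed values
x0 = 1.0;                         % left edge of a small range
dx = 0.1;                         % width of the range
P = mean((x > x0) & (x < x0+dx)); % probability of falling into the range
p = P/dx;                         % density estimate, close to p_g(1) = 0.24
\end{lstlisting}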
\begin{figure}[t]
\includegraphics[width=1\textwidth]{pdfprobabilities}
\titlecaption{\label{pdfprobabilitiesfig} Probability of a
probability density.}{The probability of a data value $x$ between,
e.g., zero and one is the integral (red area) of the probability
density (blue).}
\end{figure}
The probability of getting a value $x$ between $x_1$ and $x_2$ is
given by the integral of the probability density:
\[ P(x_1 < x < x_2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
Because the probability to get any value $x$ is one, the integral of
the probability density over the whole real axis must be one:
\begin{equation}
\label{pdfnorm}
P(-\infty < x < \infty) = \int\limits_{-\infty}^{+\infty} p(x) \, dx = 1 \; .
\end{equation}
\pagebreak[2]
The function $p(x)$ that assigns a probability density to every value
$x$ is called the \enterm{probability density function},
\enterm[pdf|see{probability density function}]{pdf}, or just
\enterm[density|see{probability density function}]{density}
(\determ{Wahrscheinlichkeitsdichtefunktion}). The well-known
\enterm{normal distribution} (\determ{Normalverteilung}) is an example of a
probability density function
\[ p_g(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
--- the \enterm{Gaussian distribution}
(\determ{Gau{\ss}sche-Glockenkurve}) with mean $\mu$ and standard
deviation $\sigma$.
The factor in front of the exponential function ensures the normalization to
$\int p_g(x) \, dx = 1$, \eqnref{pdfnorm}.
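The normalization can be checked numerically (a minimal sketch using a
simple Riemann sum; the range of $x$-values is an assumed example that
covers virtually all of the density):
\begin{lstlisting}
mu = 0.0; sigma = 1.0;      % parameters of the normal distribution
dx = 0.01;                  % integration step width
x = -8.0:dx:8.0;            % range covering virtually all of the density
pg = exp(-0.5*((x-mu)/sigma).^2)/sqrt(2.0*pi*sigma^2);
sum(pg)*dx                  % Riemann sum of the integral, approximately one
\end{lstlisting}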
\newpage
\begin{exercise}{gaussianpdf.m}{gaussianpdf.out}
\begin{enumerate}
\item Plot the probability density of the normal distribution $p_g(x)$.
@ -288,6 +294,38 @@ Standardabweichung $\sigma$.
\end{enumerate}
\end{exercise}
\newpage
Histograms of real-valued data depend on both the number of data
values and the chosen bin width. As in the example with the die
(\figref{diehistogramsfig} left), the height of the histogram
increases with the size of the data set. Also, as the bin width is
increased, the height of the histogram increases, because more data
values fall within each bin (\figref{pdfhistogramfig} left).
\begin{exercise}{gaussianbins.m}{}
Draw 100 random numbers from a Gaussian distribution and plot
histograms of the data with different bin widths. What do you
observe?
\end{exercise}
To turn such histograms into estimates of probability densities they
need to be normalized such that, according to \eqnref{pdfnorm}, their
integral equals one. While histograms of categorical data are
normalized such that their sum equals one, here we need to integrate
over the histogram. The integral is the area (not the height) of the
histogram bars. Each bar has the height $n_i$ and the width $\Delta
x$. The total area $A$ of the histogram with $M$ bins is thus
\[ A = \sum_{i=1}^M ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^M n_i = N \, \Delta x \]
and the normalized histogram has the heights
\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^M n_i} = \frac{n_i}{N
\Delta x} \; .\]
A histogram needs to be divided by both the sum of the frequencies
$n_i$ and the bin width $\Delta x$ to result in an estimate of the
corresponding probability density. Only then can the distribution be
compared with other distributions, and in particular with theoretical
probability density functions like the one of the normal distribution
(\figref{pdfhistogramfig} right).
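In code this normalization amounts to dividing the counts by $N \,
\Delta x$ (a minimal sketch along the lines of the exercise below):
\begin{lstlisting}
x = randn(100, 1);          % some Gaussian data
dx = 0.5;                   % bin width
bins = -4:dx:4;             % bin positions
[n, b] = hist(x, bins);     % frequencies n_i
p = n/sum(n)/dx;            % normalize to a probability density
bar(b, p);                  % sum(p)*dx equals one
\end{lstlisting}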
\begin{figure}[t]
\includegraphics[width=1\textwidth]{pdfhistogram}
\titlecaption{\label{pdfhistogramfig} Histograms with different bin
@ -300,36 +338,106 @@ Standardabweichung $\sigma$.
normal distributions (blue).}
\end{figure}
\newpage
\begin{exercise}{gaussianbinsnorm.m}{}
Normalize the histogram of the previous exercise to a probability density.
\end{exercise}
\newpage
\subsection{Kernel densities}
A problem with using histograms for estimating probability densities
is that they have hard bin edges. Depending on where the bin edges
are placed, a given data value falls into one bin or the other.
\begin{figure}[t]
\includegraphics[width=1\textwidth]{kerneldensity}
\titlecaption{\label{kerneldensityfig} Kernel densities.}{Left: The
histogram-based estimate of the probability density also depends on
the position of the bins. In the bottom plot the bins have been
shifted by half a bin width (here $\Delta x=0.4$) and as a result
details of the probability density look different. Look, for
example, at the height of the largest bin. Right: In contrast, a
kernel density is uniquely defined for a given kernel width (here
Gaussian kernels with a standard deviation of $\sigma=0.2$).}
\end{figure}
To avoid this problem one can use so-called \enterm{kernel densities}
for estimating probability densities from data. Here, every data point
is replaced by a kernel (a function with integral one, such as the
Gaussian function) that is centered at the position given by the data
value. The kernels of all data values are then summed up, the sum is
divided by the number of data values, and we get an estimate of the
probability density.
As with the bin width of a histogram, the width of the kernels needs
to be chosen appropriately.
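The summation of kernels can be sketched in a few lines (a simple,
direct implementation that evaluates every kernel on the full
$x$-grid; the kernel width $\sigma$ is an assumed example value, and
the exercise below asks for a version that handles the kernels more
efficiently):
\begin{lstlisting}
data = randn(100, 1);       % some data
sigma = 0.2;                % standard deviation of the Gaussian kernels
x = -4:0.01:4;              % grid on which to estimate the density
kd = zeros(size(x));        % vector for the kernel density
for i = 1:length(data)      % for every data value ...
    % ... add a normalized Gaussian centered at the data value:
    kd = kd + exp(-0.5*((x-data(i))/sigma).^2)/sqrt(2.0*pi)/sigma;
end
kd = kd/length(data);       % normalize to integral one
plot(x, kd);
\end{lstlisting}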
\newpage
\begin{exercise}{gaussiankerneldensity.m}{}
Construct and plot a kernel density of the data from the previous
two exercises.
\end{exercise}
\subsection{Cumulative distributions}
The \enterm{cumulative distribution function},
\enterm[cdf|see{cumulative distribution function}]{cdf}, or
\enterm[cumulative density function|see{cumulative distribution
function}]{cumulative density function}
(\determ{kumulative Verteilung}) is the integral of the probability
density up to a given value $x$:
\[ F(x) = \int_{-\infty}^{x} p(x') \, dx' \; . \]
As such, the cumulative distribution is itself a probability: the
probability of getting a value smaller than $x$.
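As an aside, for the normal distribution this integral cannot be
expressed in elementary functions; it is usually written in terms of
the error function,
\[ F_g(x) = \frac{1}{2}\left(1 +
\mathrm{erf}\left(\frac{x-\mu}{\sqrt{2}\,\sigma}\right)\right) \; , \]
which is why below we compute it by numerical integration.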
For estimating the cumulative distribution from a set of data values
we do not need to rely on histograms or kernel densities. Instead, it
can be computed directly from the data, without the need to choose a
bin width or a kernel width. For a data set of $N$ data values $x_i$
the probability of a data value smaller than $x$ is the number of data
points with values smaller than $x$ divided by $N$. If we sort the
data values, then at each data value $x_i$ the number of data elements
smaller than $x_i$ increases by one and the corresponding probability
of getting a value smaller than $x_i$ increases by $1/N$. That is, the
cumulative distribution estimated from the sorted data is
\[ F(x_i) = \frac{i}{N} \; . \]
See \figref{cumulativefig} for an example.
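In code this estimate boils down to sorting the data (a minimal
sketch; reading off a percentile via \texttt{interp1} is one possible
way to invert the estimated cumulative distribution):
\begin{lstlisting}
x = randn(200, 1);             % some data
xs = sort(x);                  % sorted data values
F = (1:length(xs))/length(xs); % empirical cumulative distribution
plot(xs, F);
med = interp1(F, xs, 0.5);     % median from the inverse cdf
\end{lstlisting}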
The cumulative distribution tells you the fraction of data values that
lie below a certain value and can therefore be used to evaluate the
significance of null hypotheses constructed from data, as is done with
bootstrap methods (see chapter \ref{bootstrapchapter}). Conversely,
the values of quartiles and percentiles can be determined from the
inverse cumulative distribution function.
\begin{figure}[t]
\includegraphics[width=1\textwidth]{cumulative}
\titlecaption{\label{cumulativefig} Estimation of the cumulative
distribution.}{The cumulative distribution $F(x)$ estimated from
100 data values drawn from a normal distribution (red) in
comparison to the true cumulative distribution function computed
by numerically integrating the normal distribution (blue). From
the cumulative distribution function one can read off the
probability of getting values smaller than a given value (here:
$P(x<-1) \approx 0.15$). From the inverse cumulative distribution
the position of percentiles can be computed (here: the median
(50\,\% percentile) is, as expected, close to zero and the third
quartile (75\,\% percentile) is at $x=0.68$).}
\end{figure}
\begin{exercise}{cumulative.m}{cumulative.out}
Generate 200 normally distributed data values and construct an
estimate of the cumulative distribution function from these data.
Compare this estimate with the numerically integrated cumulative
distribution of the normal distribution. Use the estimate to
compute the probability of getting data values smaller than $-1$,
and to determine the 5\,\% percentile.
\end{exercise}
\newpage
\section{Correlations}
Until now we described properties of univariate data sets. In
@ -353,7 +461,10 @@ data in a correlation coefficient close to zero
\begin{figure}[tp]
\includegraphics[width=1\textwidth]{correlation}
\titlecaption{\label{correlationfig} Correlations between pairs of
data.}{Shown are scatter plots of four data sets. Each point is a
single data pair. The correlation coefficient $r$ is given in the top
left of each plot.}
\end{figure}
\begin{exercise}{correlations.m}{}