added section on cumulative densities

commit 6c95ec7256 (parent 12a417d6bc)
@@ -1,6 +1,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \chapter{\tr{Bootstrap Methods}{Bootstrap Methoden}}
+\label{bootstrapchapter}
 
 With the \determ{Bootstrap} one generates the distribution of statistics by resampling
 from the data sample. This has several advantages:
@@ -46,7 +46,7 @@
 \include{programmingstyle/lecture/programmingstyle}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\part{Grundlagen der Datenanalyse}
+\part{Data analysis}
 
 \graphicspath{{statistics/lecture/}{statistics/lecture/figures/}}
 \lstset{inputpath=statistics/code}
statistics/code/cumulative.m (new file)
@@ -0,0 +1,18 @@
+x = randn(200, 1);              % generate some data
+xs = sort(x);                   % sort the data
+cdf = [1:length(x)]/length(x);  % cumulative fraction of data values
+plot(xs, cdf);
+hold on;
+
+dx = 0.01;
+xx = [-4:dx:4];                        % x-values for Gaussian pdf
+gauss = exp(-0.5*xx.^2)/sqrt(2.0*pi);  % Gaussian pdf
+gausscdf = cumsum(gauss)*dx;           % integrate the pdf to get the cdf
+plot(xx, gausscdf);
+hold off;
+
+printf('data : probability of x<-1: %.2f\n', cdf(xs<-1.0)(end))
+printf('gauss: probability of x<-1: %.2f\n', gausscdf(xx<-1.0)(end))
+printf('\n')
+printf('data : 5%% percentile at %.2f\n', xs(cdf<0.05)(end))
+printf('gauss: 5%% percentile at %.2f\n', xx(gausscdf<0.05)(end))
@@ -1,11 +1,11 @@
 x = randn(100, 1); % generate some data
-bins1 = -4:2:4; % large bins
-bins2 = -4:0.5:4; % small bins
-[h1,b1] = hist(x,bins1);
-[h2,b2] = hist(x,bins2);
+db1 = 2;   % large bin width
+db2 = 0.5; % small bin width
+bins1 = -4:db1:4; % large bins
+bins2 = -4:db2:4; % small bins
+[h1,b1] = hist(x, bins1);
+[h2,b2] = hist(x, bins2);
 
-subplot( 1, 2, 1 );
 bar(b1, h1)
 hold on
 bar(b2, h2, 'facecolor', 'r' )
@@ -1,9 +1,9 @@
 hn1 = h1/sum(h1)/db1;
 hn2 = h2/sum(h2)/db2;
-subplot( 1, 2, 2 )
-bar(b1,hn1)
+bar(b1, hn1)
 hold on
-bar(b2,hn2, 'facecolor', 'r' )
+bar(b2, hn2, 'facecolor', 'r' )
 xlabel('x')
 ylabel('Probability density')
 hold off
statistics/code/gaussiankerneldensity.m (new file)
@@ -0,0 +1,42 @@
+data = randn(100, 1); % generate some data
+sigma = 0.2;          % standard deviation of Gaussian kernel
+xmin = -4.0;          % minimum x value for kernel density
+xmax = 4.0;           % maximum x value for kernel density
+dx = 0.05*sigma;      % step size for kernel density
+xg = [-4.0*sigma:dx:4.0*sigma]; % x-axis for single Gaussian kernel
+% single Gaussian kernel:
+kernel = exp(-0.5*(xg/sigma).^2)/sqrt(2.0*pi)/sigma;
+ng = (length(kernel)-1)/2;  % half the length of the Gaussian
+x = [xmin:dx:xmax+0.5*dx];  % x-axis for kernel density
+kd = zeros(1, length(x));   % vector for kernel density
+for i = 1:length(data)      % for every data value ...
+  xd = data(i);
+  % index of data value in kernel density vector:
+  inx = round((xd-xmin)/dx)+1;
+  % start index for Gaussian in kernel density vector:
+  k0 = inx-ng;
+  % end index for Gaussian in kernel density vector:
+  k1 = inx+ng;
+  g0 = 1;               % start index in Gaussian
+  g1 = length(kernel);  % end index in Gaussian
+  % check whether left side of Gaussian extends below xmin:
+  if inx < ng+1
+    % adjust start indices accordingly
+    % (kernel center ng+1 aligns with kd index inx):
+    k0 = 1;
+    g0 = ng-inx+2;
+  end
+  % check whether right side of Gaussian extends above xmax:
+  if inx > length(kd)-ng
+    % adjust end indices accordingly:
+    k1 = length(kd);
+    g1 = length(kernel)-(inx+ng-length(kd));
+  end
+  % add Gaussian on kernel density:
+  kd(k0:k1) = kd(k0:k1) + kernel(g0:g1);
+end
+kd /= length(data); % normalize by number of data points
+
+% plot kernel density:
+plot(x, kd)
+xlabel('x')
+ylabel('Probability density')
statistics/lecture/cumulative.py (new file)
@@ -0,0 +1,52 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+# data:
+rng = np.random.RandomState(981)
+data = rng.randn(100)
+xs = np.sort(data)
+cdf = np.arange(len(xs))/float(len(xs))
+
+# Gauss:
+dx = 0.01
+xx = np.arange(-4.0, 4.0, dx)
+gauss = np.exp(-0.5*xx*xx)/np.sqrt(2.0*np.pi)
+gausscdf = np.cumsum(gauss)*dx
+
+# plot:
+plt.xkcd()
+fig = plt.figure( figsize=(6, 2.6) )
+ax = fig.add_subplot(1, 1, 1)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel( 'x' )
+ax.set_xlim(-3.2, 3.2)
+ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
+ax.set_ylabel( 'F(x)' )
+ax.set_ylim(-0.05, 1.05)
+ax.set_yticks( np.arange( 0.0, 1.1, 0.2 ) )
+
+med = xs[cdf>=0.5][0]
+ax.plot([-3.2, med, med], [0.5, 0.5, 0.0], 'k', lw=1, zorder=-5)
+ax.text(-2.8, 0.55, 'F=0.5')
+ax.text(0.15, 0.25, 'median at %.2f' % med)
+
+q3 = xs[cdf>=0.75][0]
+ax.plot([-3.2, q3, q3], [0.75, 0.75, 0.0], 'k', lw=1, zorder=-5)
+ax.text(-2.8, 0.8, 'F=0.75')
+ax.text(0.8, 0.5, '3. quartile at %.2f' % q3)
+
+p = cdf[xs>=-1.0][0]
+ax.plot([-3.2, -1.0, -1.0], [p, p, 0.0], 'k', lw=1, zorder=-5)
+ax.text(-2.8, 0.2, 'F=%.2f' % p)
+ax.text(-0.9, 0.05, '-1')
+
+ax.plot(xx, gausscdf, '-', color='#0000ff', lw=2, zorder=-1)
+ax.plot(xs, cdf, '-', color='#cc0000', lw=4, zorder=-1)
+ax.plot([-3.2, 3.2], [1.0, 1.0], '--', color='k', lw=2, zorder=-10)
+
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3)
+fig.savefig( 'cumulative.pdf' )
+#plt.show()
@@ -34,6 +34,6 @@ ax.set_ylim(0, 0.23)
 ax.set_ylabel( 'Probability' )
 ax.plot([0.2, 6.8], [1.0/6.0, 1.0/6.0], '-b', lw=2, zorder=1)
 ax.hist([x2, x1], bins, normed=True, color=['#FFCC00', '#FFFF66' ], zorder=10)
-plt.tight_layout()
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
 fig.savefig( 'diehistograms.pdf' )
 #plt.show()
statistics/lecture/kerneldensity.py (new file)
@@ -0,0 +1,83 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+# normal distribution:
+rng = np.random.RandomState(6281)
+x = np.arange( -4.0, 4.0, 0.01 )
+g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
+r = rng.randn(100)
+
+def kerneldensity(data, xmin, xmax, sigma=1.0) :
+    dx = 0.05*sigma
+    xg = np.arange(-4.0*sigma, 4.0*sigma + 0.5*dx, dx)
+    gauss = np.exp(-0.5*xg*xg/sigma/sigma)/np.sqrt(2.0*np.pi)/sigma
+    ng = len(gauss)//2   # integer division (works in Python 2 and 3)
+    x = np.arange(xmin, xmax+0.5*dx, dx)
+    kd = np.zeros(len(x))
+    for xd in data:
+        inx = int((xd-xmin)/dx)
+        k0 = inx-ng
+        k1 = inx+ng+1
+        g0 = 0
+        g1 = len(gauss)
+        if inx < ng:
+            k0 = 0
+            g0 = ng-inx
+        if inx >= len(kd)-ng:
+            k1 = len(kd)
+            g1 = len(gauss)-(inx+ng-len(kd)+1)
+        kd[k0:k1] += gauss[g0:g1]
+    kd /= len(data)
+    return kd, x
+
+
+plt.xkcd()
+
+fig = plt.figure( figsize=(6,3) )
+ax = fig.add_subplot(2, 2, 1)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel( 'x' )
+ax.set_xlim(-3.2, 3.2)
+ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
+ax.set_ylabel( 'p(x)' )
+ax.set_ylim(0.0, 0.49)
+ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
+#ax.plot(x, g, '-b', lw=2, zorder=-1)
+ax.hist(r, np.arange(-4.1, 4, 0.4), normed=True, color='#FFCC00', zorder=-5)
+
+ax = fig.add_subplot(2, 2, 3)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel( 'x' )
+ax.set_xlim(-3.2, 3.2)
+ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
+ax.set_ylabel( 'p(x)' )
+ax.set_ylim(0.0, 0.49)
+ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
+#ax.plot(x, g, '-b', lw=2, zorder=-1)
+ax.hist(r, np.arange(-4.3, 4, 0.4), normed=True, color='#FFCC00', zorder=-5)
+
+ax = fig.add_subplot(1, 2, 2)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel( 'x' )
+ax.set_xlim(-3.2, 3.2)
+ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
+ax.set_ylabel( 'Probab. density p(x)' )
+ax.set_ylim(0.0, 0.49)
+ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
+kd, xx = kerneldensity(r, -3.2, 3.2, 0.2)
+ax.fill_between(xx, 0.0, kd, color='#FF9900', zorder=-5)
+ax.plot(xx, kd, '-', lw=3, color='#CC0000', zorder=-1)
+
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3)
+fig.savefig( 'kerneldensity.pdf' )
+#plt.show()
@@ -38,7 +38,7 @@ ax.plot(x, g, '-b', lw=2, zorder=-1)
 ax.hist(r, 5, normed=True, color='#CC0000', zorder=-10)
 ax.hist(r, 20, normed=True, color='#FFCC00', zorder=-5)
 
-plt.tight_layout()
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
 fig.savefig( 'pdfhistogram.pdf' )
 #plt.show()
@@ -7,7 +7,7 @@ g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
 q = [ -0.67488, 0.0, 0.67488 ]
 
 plt.xkcd()
-fig = plt.figure( figsize=(6,3.4) )
+fig = plt.figure( figsize=(6,3.2) )
 ax = fig.add_subplot( 1, 1, 1 )
 ax.spines['right'].set_visible(False)
 ax.spines['top'].set_visible(False)
@@ -44,6 +44,7 @@ ax.plot(x,g, 'b', lw=4)
 ax.plot([0.0, 0.0], [0.0, 0.45], 'k', lw=2 )
 ax.plot([q[0], q[0]], [0.0, 0.4], 'k', lw=2 )
 ax.plot([q[2], q[2]], [0.0, 0.4], 'k', lw=2 )
-plt.tight_layout()
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
+#plt.tight_layout()
 fig.savefig( 'quartile.pdf' )
 #plt.show()
@@ -19,9 +19,6 @@
 \section{TODO}
 \begin{itemize}
 \item Replace exercise 1.3 (boxwhisker) by one recreating figure 1.
-\item Proper introduction to probabilities and densities first!
-\item Cumulative probability
-\item Kernel histograms (important for convolved PSTH)!
 \end{itemize}
 
 \end{document}
@@ -80,12 +80,12 @@ used to illustrate the standard deviation of the data
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{median}
 \titlecaption{\label{medianfig} Median, mean and mode of a
-probability distribution.}{Left: Median, mean and mode are
-identical for the symmetric and unimodal normal distribution.
-Right: for asymmetric distributions these three measures differ. A
-heavy tail of a distribution pulls out the mean most strongly. In
-contrast, the median is more robust against heavy tails, but not
-necessarily identical with the mode.}
+probability distribution.}{Left: Median, mean and mode coincide
+for the symmetric and unimodal normal distribution. Right: for
+asymmetric distributions these three measures differ. A heavy tail
+of a distribution pulls out the mean most strongly. In contrast,
+the median is more robust against heavy tails, but not necessarily
+identical with the mode.}
 \end{figure}
 
 The \enterm{mode} is the most frequent value, i.e. the position of the maximum of the probability distribution.
@@ -113,7 +113,10 @@ not smaller than the median (\figref{medianfig}).
 
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{quartile}
-\titlecaption{\label{quartilefig} Median and quartiles of a normal distribution.}{}
+\titlecaption{\label{quartilefig} Median and quartiles of a normal
+distribution.}{The interquartile range between the first and the
+third quartile contains 50\,\% of the data and includes the
+median.}
 \end{figure}
 
 The distribution of data can be further characterized by the position
@@ -164,7 +167,9 @@ The distribution of values in a data set is estimated by histograms
 $N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$
 (\figref{diehistogramsfig} left). The bins tile the data range
 usually into intervals of the same size. The width of the bins is
-called the bin width.
+called the bin width. The frequencies $n_i$ plotted against the
+categories $i$ constitute the \enterm{histogram}, or the \enterm{frequency
+histogram}.
 
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{diehistograms}
@@ -219,7 +224,7 @@ category $i$, i.e. of getting a data value in the $i$-th bin.
 \subsection{Probability density functions}
 
 In cases where we deal with data sets of measurements of a real
-quantity (e.g. the length of snakes, the weight of elephants, the time
+quantity (e.g. lengths of snakes, weights of elephants, times
 between succeeding spikes) there is no natural bin width for computing
 a histogram. In addition, the probability of measuring a data value that
 equals exactly a specific real number like, e.g., 0.123456789 is zero, because
@@ -230,7 +235,7 @@ range. For example, we can ask for the probability $P(1.2<x<1.3)$ to
 get a measurement between 1.2 and 1.3 (\figref{pdfprobabilitiesfig}). More
 generally, we want to know the probability $P(x_0<x<x_1)$ to obtain a
 measurement between $x_0$ and $x_1$. If we define the width of the
-range defined by $x_0$ and $x_1$ is $\Delta x = x_1 - x_0$ then the
+range between $x_0$ and $x_1$ as $\Delta x = x_1 - x_0$ then the
 probability can also be expressed as $P(x_0<x<x_0 + \Delta x)$.
 
 In the limit of very small ranges $\Delta x$ the probability of
@@ -238,44 +243,45 @@ getting a measurement between $x_0$ and $x_0+\Delta x$ scales down to
 zero with $\Delta x$:
 \[ P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x \; . \]
 Here the quantity $p(x_0)$ is a so-called \enterm{probability
-density}. This is not a unitless probability with values between 0
-and 1, but a number that takes on any positive real number and has as
-a unit the inverse of the unit of the data values --- hence the name
-``density''.
+density} that is larger than zero and that describes the
+distribution of the data values. The probability density is not a
+unitless probability with values between 0 and 1, but a number that
+can take on any positive real value and has as a unit the inverse of the
+unit of the data values --- hence the name ``density''.
 
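This approximation is easy to probe numerically (not part of the commit; a minimal numpy sketch with illustrative names):

import numpy as np

def p(x):
    # standard normal probability density:
    return np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)

rng = np.random.RandomState(271)
r = rng.randn(1000000)  # many samples
x0 = 1.0
for dx in [0.5, 0.1, 0.01]:
    # fraction of samples falling into [x0, x0+dx):
    prob = np.mean((r >= x0) & (r < x0+dx))
    print('dx=%4.2f: P=%.5f  p(x0)*dx=%.5f' % (dx, prob, p(x0)*dx))
# the two columns converge as dx shrinks
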
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{pdfprobabilities}
 \titlecaption{\label{pdfprobabilitiesfig} Probability of a
 probability density.}{The probability of a data value $x$ between,
-e.g., zero and one is the integral (red area) over the probability
+e.g., zero and one is the integral (red area) of the probability
 density (blue).}
 \end{figure}
 
 The probability to get a value $x$ between $x_1$ and $x_2$ is
-given by the integral over the probability density:
+given by the integral of the probability density:
 \[ P(x_1 < x < x_2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
-Because the probability to get any value $x$ is one, the integral over
-the probability density
+Because the probability of getting any value $x$ at all is one, the integral of
+the probability density over the whole real axis must be one:
 
-Da die Wahrscheinlichkeit irgendeines Wertes $x$ Eins ergeben muss gilt die Normierung
 \begin{equation}
 \label{pdfnorm}
 P(-\infty < x < \infty) = \int\limits_{-\infty}^{+\infty} p(x) \, dx = 1 \; .
 \end{equation}
 
-\pagebreak[2]
-Die gesamte Funktion $p(x)$, die jedem Wert $x$ einen
-Wahrscheinlichkeitsdichte zuordnet wir auch
-\determ{Wahrscheinlichkeitsdichtefunktion} (\enterm{probability
-density function}, \enterm[pdf|see{probability density
-function}]{pdf}, oder kurz \enterm[density|see{probability density
-function}]{density}) genannt. Die bekannteste
-Wahrscheinlichkeitsdichtefunktion ist die der \determ{Normalverteilung}
-\[ p_g(x) =
-\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
---- die \determ{Gau{\ss}sche-Glockenkurve} mit Mittelwert $\mu$ und
-Standardabweichung $\sigma$.
+The function $p(x)$ that assigns a probability density to every $x$
+is called the \enterm{probability density function},
+\enterm[pdf|see{probability density function}]{pdf}, or just
+\enterm[density|see{probability density function}]{density}
+(\determ{Wahrscheinlichkeitsdichtefunktion}). The well-known
+\enterm{normal distribution} (\determ{Normalverteilung}) is an example of a
+probability density function
+\[ p_g(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
+--- the \enterm{Gaussian distribution}
+(\determ{Gau{\ss}sche-Glockenkurve}) with mean $\mu$ and standard
+deviation $\sigma$.
+The factor in front of the exponential function ensures the normalization to
+$\int p_g(x) \, dx = 1$, \eqnref{pdfnorm}.
 
+\newpage
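The stated normalization can be checked numerically; a minimal sketch, assuming an arbitrary choice of mu and sigma (not code from the commit):

import numpy as np

mu = 1.0     # arbitrary mean
sigma = 0.5  # arbitrary standard deviation
x = np.arange(-10.0, 10.0, 0.001)  # range wide enough to cover the density
pg = np.exp(-0.5*((x-mu)/sigma)**2)/np.sqrt(2.0*np.pi*sigma**2)
print('integral of p_g: %.4f' % np.trapz(pg, x))  # prints 1.0000
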
 \begin{exercise}{gaussianpdf.m}{gaussianpdf.out}
 \begin{enumerate}
 \item Plot the probability density of the normal distribution $p_g(x)$.
@@ -288,6 +294,38 @@ Standardabweichung $\sigma$.
 \end{enumerate}
 \end{exercise}
 
+\newpage
+Histograms of real-valued data depend on both the number of data
+values and the chosen bin width. As in the example with the die
+(\figref{diehistogramsfig} left), the height of the histogram gets
+larger the larger the size of the data set. Also, as the bin width is
+increased the height of the histogram increases, because more data
+values fall within each bin (\figref{pdfhistogramfig} left).
+
+\begin{exercise}{gaussianbins.m}{}
+Draw 100 random data values from a Gaussian distribution and plot
+histograms of the data with different bin sizes. What do you
+observe?
+\end{exercise}
+
+To turn such histograms into estimates of probability densities they
+need to be normalized such that according to \eqnref{pdfnorm} their
+integral equals one. While histograms of categorical data are
+normalized such that their sum equals one, here we need to integrate
+over the histogram. The integral is the area (not the height) of the
+histogram bars. Each bar has the height $n_i$ and the width $\Delta
+x$. The total area $A$ of the histogram with $M$ bins is thus
+\[ A = \sum_{i=1}^M ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^M n_i = N \, \Delta x \]
+and the normalized histogram has the heights
+\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^M n_i} = \frac{n_i}{N
+\Delta x} \; .\]
+A histogram needs to be divided by both the sum of the frequencies
+$n_i$ and the bin width $\Delta x$ to result in an estimate of the
+corresponding probability density. Only then can the distribution be
+compared with other distributions and in particular with theoretical
+probability density functions like the one of the normal distribution
+(\figref{pdfhistogramfig} right).
+
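A minimal numpy sketch of this division by N and the bin width (illustrative, not part of the commit; numpy's density flag serves only as a cross-check):

import numpy as np

data = np.random.randn(100)             # some data
bins = np.arange(-4.0, 4.1, 0.5)
n, edges = np.histogram(data, bins)     # frequencies n_i
dx = edges[1] - edges[0]                # bin width
p = n/float(np.sum(n))/dx               # divide by N AND by the bin width
print('integral: %.3f' % np.sum(p*dx))  # -> 1.000
# cross-check against numpy's built-in normalization:
pd, _ = np.histogram(data, bins, density=True)
print(np.allclose(p, pd))               # -> True
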
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{pdfhistogram}
 \titlecaption{\label{pdfhistogramfig} Histograms with different bin
@@ -300,36 +338,106 @@ Standardabweichung $\sigma$.
 normal distributions (blue).}
 \end{figure}
 
-\pagebreak[4]
-\begin{exercise}{gaussianbins.m}{}
-Draw 100 random data from a Gaussian distribution and plot
-histograms with different bin sizes of the data. What do you
-observe?
+\newpage
+\begin{exercise}{gaussianbinsnorm.m}{}
+Normalize the histogram of the previous exercise to a probability density.
 \end{exercise}
 
-Damit Histogramme von reellen Messwerten trotz unterschiedlicher
-Anzahl von Messungen und unterschiedlicher Klassenbreiten
-untereinander vergleichbar werden und mit bekannten
-Wahrscheinlichkeitsdichtefunktionen verglichen werden k\"onnen,
-m\"ussen sie auf das Integral Eins normiert werden
-\eqnref{pdfnorm}. Das Integral (nicht die Summe) \"uber das Histogramm
-soll Eins ergeben --- denn die Wahrscheinlichkeit, dass irgendeiner
-der Messwerte auftritt mu{\ss} Eins sein. Das Integral ist die
-Fl\"ache des Histogramms, die sich aus der Fl\"ache der einzelnen
-Histogrammbalken zusammen setzt. Die Balken des Histogramms haben die
-H\"ohe $n_i$ und die Breite $\Delta x$. Die Gesamtfl\"ache $A$ des
-Histogramms ist also
-\[ A = \sum_{i=1}^N ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^N n_i \]
-und das normierte Histogramm hat die H\"ohe
-\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} \]
-Es muss also nicht nur durch die Summe, sondern auch durch die Breite
-$\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
 
-\begin{exercise}{gaussianbinsnorm.m}{}
-Normiere das Histogramm der vorherigen \"Ubung zu einer Wahrscheinlichkeitsdichte.
+\newpage
+\subsection{Kernel densities}
 
+A problem of using histograms for estimating probability densities is
+that they have hard bin edges. Depending on where the bin edges are placed
+a data value falls in one or the other bin.
+
+\begin{figure}[t]
+\includegraphics[width=1\textwidth]{kerneldensity}
+\titlecaption{\label{kerneldensityfig} Kernel densities.}{Left: The
+histogram-based estimate of the probability density depends
+also on the position of the bins. In the bottom plot the bins have
+been shifted by half a bin width (here $\Delta x=0.4$) and as a
+result details of the probability density look different. Look,
+for example, at the height of the largest bin. Right: In contrast,
+a kernel density is uniquely defined for a given kernel width
+(here Gaussian kernels with standard deviation $\sigma=0.2$).}
+\end{figure}
+
+To avoid this problem one can use so-called \enterm{kernel densities}
+for estimating probability densities from data. Here every data point
+is replaced by a kernel (a function with integral one, like for
+example the Gaussian function) that is moved exactly to the position
+indicated by the data value. Then all the kernels of all the data
+values are summed up, the sum is divided by the number of data values,
+and we get an estimate of the probability density.
+
+As for the histogram, where we need to choose a bin width, we need to
+choose the width of the kernels appropriately.
+
\newpage
|
||||||
|
\begin{exercise}{gaussiankerneldensity.m}{}
|
||||||
|
Construct and plot a kernel density of the data from the previous
|
||||||
|
two exercises.
|
||||||
\end{exercise}
|
\end{exercise}
|
||||||
|
|
||||||
|
\subsection{Cumulative distributions}
|
||||||
|
The \enterm{cumulative distribution function},
|
||||||
|
\enterm[cdf|see{cumulative distribution function}]{cdf}, or
|
||||||
|
\enterm[cumulative density function|see{cumulative distribution
|
||||||
|
function}]{cumulative density function}
|
||||||
|
(\determ{kumulative Verteilung}) is the integral over the probability density
|
||||||
|
up to any value $x$:
|
||||||
|
\[ F(x) = \int_{-\infty}^x p(x') \, dx' \]
|
||||||
|
As such the cumulative distribution is a probability. It is the
|
||||||
|
probability of getting a value smaller than $x$.
|
||||||
|
|
||||||
|
For estimating the cumulative distribution from a set of data values
|
||||||
|
we do not need to rely on histograms or kernel densities. Instead, it
|
||||||
|
can be computed from the data directly without the need of a bin width
|
||||||
|
or width of a kernel. For a data set of $N$ data values $x_i$ the
|
||||||
|
probability of a data value smaller than $x$ is the number of data
|
||||||
|
points with values smaller than $x$ divided by $N$. If we sort the
|
||||||
|
data values than at each data value $x_i$ the number of data elements
|
||||||
|
smaller than $x_i$ is increased by one and the corresponding
|
||||||
|
probability of getting a value smaller than $x_i$ is increased by $1/N$.
|
||||||
|
That is, the cumulative distribution is
|
||||||
|
\[ F(x_i) = \frac{i}{N} \]
|
||||||
|
See \figref{cumulativefig} for an example.
|
||||||
|
|
||||||
|
The cumulative distribution tells you the fraction of data that are
|
||||||
|
below a certain value and can therefore be used to evaluate significance
|
||||||
|
from Null-hypothesis constructed from data, as it is done with bootstrap methods
|
||||||
|
(see chapter \ref{bootstrapchapter}). The other way around the values of quartiles
|
||||||
|
and percentiles can be determined from the inverse cumulative function.
|
||||||
|
|
||||||
|
\begin{figure}[t]
|
||||||
|
\includegraphics[width=1\textwidth]{cumulative}
|
||||||
|
\titlecaption{\label{cumulativefig} Estimation of the cumulative
|
||||||
|
distribution.}{The cumulative distribution $F(x)$ estimated from
|
||||||
|
100 data values drawn from a normal distribution (red) in
|
||||||
|
comparison to the true cumulative distribution function computed
|
||||||
|
by numerically integrating the normal distribution function
|
||||||
|
(blue). From the cumulative distribution function one can read of
|
||||||
|
the probabilities of getting values smaller than a given value
|
||||||
|
(here: $P(x \ge -1) \approx 0.15$). From the inverse cumulative
|
||||||
|
distribution the position of percentiles can be computed (here:
|
||||||
|
the median (50\,\% percentile) is as expected close to zero and
|
||||||
|
the third quartile (75\,\% percentile) at $x=0.68$.}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{exercise}{cumulative.m}{cumulative.out}
|
||||||
|
Generate 200 normally distributed data values and construct an
|
||||||
|
estimate of the cumulative distribution function from this data.
|
||||||
|
|
||||||
|
Compare this estimate with an integral over the normal distribution.
|
||||||
|
|
||||||
|
Use the estimate to compute the probability of having data values
|
||||||
|
smaller than $-1$.
|
||||||
|
|
||||||
|
Use the estimate to compute the value of the 5\,\% percentile.
|
||||||
|
\end{exercise}
|
||||||
|
|
||||||
|
\newpage
|
||||||
 \section{Correlations}
 
 Until now we described properties of univariate data sets. In
@@ -353,7 +461,10 @@ data in a correlation coefficient close to zero
 
 \begin{figure}[tp]
 \includegraphics[width=1\textwidth]{correlation}
-\titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
+\titlecaption{\label{correlationfig} Correlations between pairs of
+data.}{Shown are scatter plots of four data sets. Each point is a
+single data pair. The correlation coefficient $r$ is given in the top
+left of each plot.}
 \end{figure}
 
 \begin{exercise}{correlations.m}{}