added section on cumulative densities

Jan Benda 2017-11-25 18:46:03 +01:00
parent 12a417d6bc
commit 6c95ec7256
13 changed files with 380 additions and 75 deletions

View File

@ -1,6 +1,7 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{\tr{Bootstrap Methods}{Bootstrap Methoden}}
\label{bootstrapchapter}
With the \determ{Bootstrap} the distribution of a statistic is
generated by resampling from the sample itself. This has several advantages:

View File

@ -46,7 +46,7 @@
\include{programmingstyle/lecture/programmingstyle}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\part{Data analysis}
\graphicspath{{statistics/lecture/}{statistics/lecture/figures/}}
\lstset{inputpath=statistics/code}

View File

@ -0,0 +1,18 @@
x = randn(200, 1);                     % generate some data
xs = sort(x);                          % sort the data
cdf = (1:length(x))/length(x);         % empirical cumulative distribution
plot(xs, cdf);
hold on;
dx = 0.01;                             % integration step width
xx = -4:dx:4;                          % x-values for Gaussian pdf
gauss = exp(-0.5*xx.^2)/sqrt(2.0*pi);  % Gaussian pdf
gausscdf = cumsum(gauss)*dx;           % integrate to get the Gaussian cdf
plot(xx, gausscdf);
hold off;
% chained indexing like cdf(xs<-1.0)(end) is Octave-only;
% use intermediate variables for MATLAB compatibility:
dp = cdf(xs < -1.0);
fprintf('data : probability of x<-1: %.2f\n', dp(end))
gp = gausscdf(xx < -1.0);
fprintf('gauss: probability of x<-1: %.2f\n', gp(end))
fprintf('\n')
dq = xs(cdf < 0.05);
fprintf('data : 5%% percentile at %.2f\n', dq(end))
gq = xx(gausscdf < 0.05);
fprintf('gauss: 5%% percentile at %.2f\n', gq(end))

View File

@ -1,11 +1,11 @@
x = randn(100, 1); % generate some data
db1 = 2; % large bin width
db2 = 0.5; % small bin width
bins1 = -4:db1:4; % large bins
bins2 = -4:db2:4; % small bins
[h1,b1] = hist(x, bins1);
[h2,b2] = hist(x, bins2);
subplot( 1, 2, 1 );
bar(b1, h1)
hold on
bar(b2, h2, 'facecolor', 'r' )

View File

@ -1,6 +1,6 @@
hn1 = h1/sum(h1)/db1;
hn2 = h2/sum(h2)/db2;
subplot( 1, 2, 2 )
bar(b1, hn1)
hold on
bar(b2, hn2, 'facecolor', 'r' )

View File

@ -0,0 +1,42 @@
data = randn(100, 1); % generate some data
sigma = 0.2; % standard deviation of Gaussian kernel
xmin = -4.0; % minimum x value for kernel density
xmax = 4.0; % maximum x value for kernel density
dx = 0.05*sigma; % step size for kernel density
xg = [-4.0*sigma:dx:4.0*sigma]; % x-axis for single Gaussian kernel
% single Gaussian kernel:
kernel = exp(-0.5*(xg/sigma).^2)/sqrt(2.0*pi)/sigma;
ng = (length(kernel)-1)/2; % half the length of the Gaussian
x = [xmin:dx:xmax+0.5*dx]; % x-axis for kernel density
kd = zeros(1, length(x)); % vector for kernel density
for i = 1:length(data)              % for every data value ...
    xd = data(i);
    % index of data value in kernel density vector:
    inx = round((xd-xmin)/dx)+1;
    % start index for Gaussian in kernel density vector:
    k0 = inx-ng;
    % end index for Gaussian in kernel density vector:
    k1 = inx+ng;
    g0 = 1;                         % start index in Gaussian
    g1 = length(kernel);            % end index in Gaussian
    % check whether left side of Gaussian extends below xmin:
    if inx < ng+1
        % adjust start indices accordingly
        % (+2: kernel center kernel(ng+1) must align with kd(inx)):
        k0 = 1;
        g0 = ng-inx+2;
    end
    % check whether right side of Gaussian extends above xmax:
    if inx > length(kd)-ng
        % adjust end indices accordingly:
        k1 = length(kd);
        g1 = length(kernel)-(inx+ng-length(kd));
    end
    % add Gaussian onto kernel density:
    kd(k0:k1) = kd(k0:k1) + kernel(g0:g1);
end
kd = kd/length(data);               % normalize by number of data points
% plot kernel density:
plot(x, kd)
xlabel('x')
ylabel('Probability density')

View File

@ -0,0 +1,52 @@
import numpy as np
import matplotlib.pyplot as plt
# data:
rng = np.random.RandomState(981)
data = rng.randn(100)
xs = np.sort(data)
cdf = np.arange(len(xs))/float(len(xs))
# Gauss:
dx = 0.01
xx = np.arange(-4.0, 4.0, dx)
gauss = np.exp(-0.5*xx*xx)/np.sqrt(2.0*np.pi)
gausscdf = np.cumsum(gauss)*dx
# plot:
plt.xkcd()
fig = plt.figure( figsize=(6, 2.6) )
ax = fig.add_subplot(1, 1, 1)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel( 'x' )
ax.set_xlim(-3.2, 3.2)
ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
ax.set_ylabel( 'F(x)' )
ax.set_ylim(-0.05, 1.05)
ax.set_yticks( np.arange( 0.0, 1.1, 0.2 ) )
med = xs[cdf>=0.5][0]
ax.plot([-3.2, med, med], [0.5, 0.5, 0.0], 'k', lw=1, zorder=-5)
ax.text(-2.8, 0.55, 'F=0.5')
ax.text(0.15, 0.25, 'median at %.2f' % med)
q3 = xs[cdf>=0.75][0]
ax.plot([-3.2, q3, q3], [0.75, 0.75, 0.0], 'k', lw=1, zorder=-5)
ax.text(-2.8, 0.8, 'F=0.75')
ax.text(0.8, 0.5, '3. quartile at %.2f' % q3)
p = cdf[xs>=-1.0][0]
ax.plot([-3.2, -1.0, -1.0], [p, p, 0.0], 'k', lw=1, zorder=-5)
ax.text(-2.8, 0.2, 'F=%.2f' % p)
ax.text(-0.9, 0.05, '-1')
ax.plot(xx, gausscdf, '-', color='#0000ff', lw=2, zorder=-1)
ax.plot(xs, cdf, '-', color='#cc0000', lw=4, zorder=-1)
ax.plot([-3.2, 3.2], [1.0, 1.0], '--', color='k', lw=2, zorder=-10)
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3)
fig.savefig( 'cumulative.pdf' )
#plt.show()

View File

@ -34,6 +34,6 @@ ax.set_ylim(0, 0.23)
ax.set_ylabel( 'Probability' )
ax.plot([0.2, 6.8], [1.0/6.0, 1.0/6.0], '-b', lw=2, zorder=1)
ax.hist([x2, x1], bins, normed=True, color=['#FFCC00', '#FFFF66' ], zorder=10)
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
fig.savefig( 'diehistograms.pdf' )
#plt.show()

View File

@ -0,0 +1,83 @@
import numpy as np
import matplotlib.pyplot as plt
# normal distribution:
rng = np.random.RandomState(6281)
x = np.arange( -4.0, 4.0, 0.01 )
g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
r = rng.randn(100)
def kerneldensity(data, xmin, xmax, sigma=1.0):
    # estimate a kernel density with Gaussian kernels of width sigma:
    dx = 0.05*sigma
    xg = np.arange(-4.0*sigma, 4.0*sigma + 0.5*dx, dx)
    gauss = np.exp(-0.5*xg*xg/sigma/sigma)/np.sqrt(2.0*np.pi)/sigma
    ng = len(gauss)//2     # half the length of the Gaussian kernel
    x = np.arange(xmin, xmax+0.5*dx, dx)
    kd = np.zeros(len(x))
    for xd in data:
        # index of the data value in the kernel density array:
        inx = int((xd-xmin)/dx)
        k0 = inx-ng            # start index in the kernel density array
        k1 = inx+ng+1          # end index in the kernel density array
        g0 = 0                 # start index in the Gaussian
        g1 = len(gauss)        # end index in the Gaussian
        if inx < ng:           # kernel extends below xmin:
            k0 = 0
            g0 = ng-inx
        if inx >= len(kd)-ng:  # kernel extends above xmax:
            k1 = len(kd)
            g1 = len(gauss)-(inx+ng-len(kd)+1)
        kd[k0:k1] += gauss[g0:g1]
    kd /= len(data)
    return kd, x
plt.xkcd()
fig = plt.figure( figsize=(6,3) )
ax = fig.add_subplot(2, 2, 1)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel( 'x' )
ax.set_xlim(-3.2, 3.2)
ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
ax.set_ylabel( 'p(x)' )
ax.set_ylim(0.0, 0.49)
ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
#ax.plot(x, g, '-b', lw=2, zorder=-1)
ax.hist(r, np.arange(-4.1, 4, 0.4), normed=True, color='#FFCC00', zorder=-5)
ax = fig.add_subplot(2, 2, 3)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel( 'x' )
ax.set_xlim(-3.2, 3.2)
ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
ax.set_ylabel( 'p(x)' )
ax.set_ylim(0.0, 0.49)
ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
#ax.plot(x, g, '-b', lw=2, zorder=-1)
ax.hist(r, np.arange(-4.3, 4, 0.4), normed=True, color='#FFCC00', zorder=-5)
ax = fig.add_subplot(1, 2, 2)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_ticks_position('bottom')
ax.set_xlabel( 'x' )
ax.set_xlim(-3.2, 3.2)
ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
ax.set_ylabel( 'Probab. density p(x)' )
ax.set_ylim(0.0, 0.49)
ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
kd, xx = kerneldensity(r, -3.2, 3.2, 0.2)
ax.fill_between(xx, 0.0, kd, color='#FF9900', zorder=-5)
ax.plot(xx, kd, '-', lw=3, color='#CC0000', zorder=-1)
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3)
fig.savefig( 'kerneldensity.pdf' )
#plt.show()

View File

@ -38,7 +38,7 @@ ax.plot(x, g, '-b', lw=2, zorder=-1)
ax.hist(r, 5, normed=True, color='#CC0000', zorder=-10)
ax.hist(r, 20, normed=True, color='#FFCC00', zorder=-5)
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
fig.savefig( 'pdfhistogram.pdf' )
#plt.show()

View File

@ -7,7 +7,7 @@ g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
q = [ -0.67488, 0.0, 0.67488 ]
plt.xkcd()
fig = plt.figure( figsize=(6,3.2) )
ax = fig.add_subplot( 1, 1, 1 )
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
@ -44,6 +44,7 @@ ax.plot(x,g, 'b', lw=4)
ax.plot([0.0, 0.0], [0.0, 0.45], 'k', lw=2 )
ax.plot([q[0], q[0]], [0.0, 0.4], 'k', lw=2 )
ax.plot([q[2], q[2]], [0.0, 0.4], 'k', lw=2 )
plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
#plt.tight_layout()
fig.savefig( 'quartile.pdf' )
#plt.show()

View File

@ -19,9 +19,6 @@
\section{TODO}
\begin{itemize}
\item Replace exercise 1.3 (boxwhisker) by one recreating figure 1.
\end{itemize}
\end{document}

View File

@ -80,12 +80,12 @@ used to illustrate the standard deviation of the data
\begin{figure}[t]
\includegraphics[width=1\textwidth]{median}
\titlecaption{\label{medianfig} Median, mean and mode of a
probability distribution.}{Left: Median, mean and mode coincide
for the symmetric and unimodal normal distribution. Right: for
asymmetric distributions these three measures differ. A heavy tail
of a distribution pulls out the mean most strongly. In contrast,
the median is more robust against heavy tails, but not necessarily
identical with the mode.}
\end{figure}
The \enterm{mode} is the most frequent value, i.e. the position of the maximum of the probability distribution.
@ -113,7 +113,10 @@ not smaller than the median (\figref{medianfig}).
\begin{figure}[t]
\includegraphics[width=1\textwidth]{quartile}
\titlecaption{\label{quartilefig} Median and quartiles of a normal
distribution.}{The interquartile range between the first and the
third quartile contains 50\,\% of the data and includes the
median.}
\end{figure}
The distribution of data can be further characterized by the position
@ -164,7 +167,9 @@ The distribution of values in a data set is estimated by histograms
$N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$
(\figref{diehistogramsfig} left). The bins tile the data range
usually into intervals of the same size. The width of the bins is
called the bin width. The frequencies $n_i$ plotted against the
categories $i$ constitute the \enterm{histogram}, or \enterm{frequency
histogram}.
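Such a frequency histogram can be computed in a few lines (a minimal
sketch, simulating a fair die):
\begin{lstlisting}
rolls = randi(6, 100, 1);   % simulate 100 rolls of a fair die
bins = 1:6;                 % one bin per category
counts = hist(rolls, bins); % frequencies n_i in each bin
bar(bins, counts);          % plot the frequency histogram
\end{lstlisting}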
\begin{figure}[t]
\includegraphics[width=1\textwidth]{diehistograms}
@ -219,7 +224,7 @@ category $i$, i.e. of getting a data value in the $i$-th bin.
\subsection{Probability density functions}
In cases where we deal with data sets of measurements of a real
quantity (e.g. lengths of snakes, weights of elephants, times
between successive spikes) there is no natural bin width for computing
a histogram. In addition, the probability of measuring a data value that
equals exactly a specific real number like, e.g., 0.123456789 is zero, because
range. For example, we can ask for the probability $P(0<x<1)$ of
getting a measurement between 0 and 1 (\figref{pdfprobabilitiesfig}). More
generally, we want to know the probability $P(x_0<x<x_1)$ of obtaining a
measurement between $x_0$ and $x_1$. If we define the width of the
range between $x_0$ and $x_1$ as $\Delta x = x_1 - x_0$ then the
probability can also be expressed as $P(x_0<x<x_0 + \Delta x)$.
In the limit of very small ranges $\Delta x$ the probability of
@ -238,44 +243,45 @@ getting a measurement between $x_0$ and $x_0+\Delta x$ scales down to
zero with $\Delta x$:
\[ P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x \; . \]
Here the quantity $p(x_0)$ is a so-called \enterm{probability
density} that is larger than or equal to zero and that describes the
distribution of the data values. The probability density is not a
unitless probability with values between 0 and 1, but a number that
can take on any non-negative real value and has as a unit the inverse
of the unit of the data values --- hence the name ``density''.
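The relation $P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x$ can
be illustrated numerically (a minimal sketch using simulated standard
normal data; the chosen $x_0$ and $\Delta x$ are example values):
\begin{lstlisting}
x = randn(10000, 1);              % many standard normally distributed values
x0 = 1.0;                         % left edge of a small range
dx = 0.1;                         % width of the range
P = mean((x > x0) & (x < x0+dx)); % probability of falling into the range
p = P/dx;                         % density estimate, close to p_g(1) = 0.24
\end{lstlisting}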
\begin{figure}[t]
\includegraphics[width=1\textwidth]{pdfprobabilities}
\titlecaption{\label{pdfprobabilitiesfig} Probability of a
probability density.}{The probability of a data value $x$ between,
e.g., zero and one is the integral (red area) of the probability
density (blue).}
\end{figure}
The probability of getting a value $x$ between $x_1$ and $x_2$ is
given by the integral of the probability density:
\[ P(x_1 < x < x_2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
Because the probability to get any value $x$ is one, the integral of
the probability density over the whole real axis must be one:
\begin{equation}
\label{pdfnorm}
P(-\infty < x < \infty) = \int\limits_{-\infty}^{+\infty} p(x) \, dx = 1 \; .
\end{equation}
\pagebreak[2]
The function $p(x)$ that assigns a probability density to every value
$x$ is called the \enterm{probability density function},
\enterm[pdf|see{probability density function}]{pdf}, or just
\enterm[density|see{probability density function}]{density}
(\determ{Wahrscheinlichkeitsdichtefunktion}). The well-known
\enterm{normal distribution} (\determ{Normalverteilung}) is an example of a
probability density function
\[ p_g(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
--- the \enterm{Gaussian distribution}
(\determ{Gau{\ss}sche-Glockenkurve}) with mean $\mu$ and standard
deviation $\sigma$.
The factor in front of the exponential function ensures the normalization to
$\int p_g(x) \, dx = 1$, \eqnref{pdfnorm}.
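The normalization can be checked numerically (a minimal sketch using a
simple Riemann sum; the range of $x$-values is an assumed example that
covers virtually all of the density):
\begin{lstlisting}
mu = 0.0; sigma = 1.0;      % parameters of the normal distribution
dx = 0.01;                  % integration step width
x = -8.0:dx:8.0;            % range covering virtually all of the density
pg = exp(-0.5*((x-mu)/sigma).^2)/sqrt(2.0*pi*sigma^2);
sum(pg)*dx                  % Riemann sum of the integral, approximately one
\end{lstlisting}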
\newpage
\begin{exercise}{gaussianpdf.m}{gaussianpdf.out}
\begin{enumerate}
\item Plot the probability density of the normal distribution $p_g(x)$.
@ -288,6 +294,38 @@ Standardabweichung $\sigma$.
\end{enumerate}
\end{exercise}
\newpage
Histograms of real-valued data depend on both the number of data
values and the chosen bin width. As in the example with the die
(\figref{diehistogramsfig} left), the height of the histogram
increases with the size of the data set. Also, as the bin width is
increased, the height of the histogram increases, because more data
values fall within each bin (\figref{pdfhistogramfig} left).
\begin{exercise}{gaussianbins.m}{}
Draw 100 random numbers from a Gaussian distribution and plot
histograms of the data with different bin widths. What do you
observe?
\end{exercise}
To turn such histograms into estimates of probability densities they
need to be normalized such that, according to \eqnref{pdfnorm}, their
integral equals one. While histograms of categorical data are
normalized such that their sum equals one, here we need to integrate
over the histogram. The integral is the area (not the height) of the
histogram bars. Each bar has the height $n_i$ and the width $\Delta
x$. The total area $A$ of the histogram with $M$ bins is thus
\[ A = \sum_{i=1}^M ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^M n_i = N \, \Delta x \]
and the normalized histogram has the heights
\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^M n_i} = \frac{n_i}{N
\Delta x} \; .\]
A histogram needs to be divided by both the sum of the frequencies
$n_i$ and the bin width $\Delta x$ to result in an estimate of the
corresponding probability density. Only then can the distribution be
compared with other distributions, and in particular with theoretical
probability density functions like the one of the normal distribution
(\figref{pdfhistogramfig} right).
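In code this normalization amounts to dividing the counts by $N \,
\Delta x$ (a minimal sketch along the lines of the exercise below):
\begin{lstlisting}
x = randn(100, 1);          % some Gaussian data
dx = 0.5;                   % bin width
bins = -4:dx:4;             % bin positions
[n, b] = hist(x, bins);     % frequencies n_i
p = n/sum(n)/dx;            % normalize to a probability density
bar(b, p);                  % sum(p)*dx equals one
\end{lstlisting}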
\begin{figure}[t]
\includegraphics[width=1\textwidth]{pdfhistogram}
\titlecaption{\label{pdfhistogramfig} Histograms with different bin
@ -300,36 +338,106 @@ Standardabweichung $\sigma$.
normal distributions (blue).}
\end{figure}
\newpage
\begin{exercise}{gaussianbinsnorm.m}{}
Normalize the histogram of the previous exercise to a probability density.
\end{exercise}
\newpage
\subsection{Kernel densities}
A problem with using histograms for estimating probability densities
is that they have hard bin edges. Depending on where the bin edges
are placed, a given data value falls into one bin or the other.
\begin{figure}[t]
\includegraphics[width=1\textwidth]{kerneldensity}
\titlecaption{\label{kerneldensityfig} Kernel densities.}{Left: The
histogram-based estimate of the probability density also depends on
the position of the bins. In the bottom plot the bins have been
shifted by half a bin width (here $\Delta x=0.4$) and as a result
details of the probability density look different. Look, for
example, at the height of the largest bin. Right: In contrast, a
kernel density is uniquely defined for a given kernel width (here
Gaussian kernels with a standard deviation of $\sigma=0.2$).}
\end{figure}
To avoid this problem one can use so-called \enterm{kernel densities}
for estimating probability densities from data. Here, every data point
is replaced by a kernel (a function with integral one, such as the
Gaussian function) that is centered at the position given by the data
value. The kernels of all data values are then summed up, the sum is
divided by the number of data values, and we get an estimate of the
probability density.
As with the bin width of a histogram, the width of the kernels needs
to be chosen appropriately.
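The summation of kernels can be sketched in a few lines (a simple,
direct implementation that evaluates every kernel on the full
$x$-grid; the kernel width $\sigma$ is an assumed example value, and
the exercise below asks for a version that handles the kernels more
efficiently):
\begin{lstlisting}
data = randn(100, 1);       % some data
sigma = 0.2;                % standard deviation of the Gaussian kernels
x = -4:0.01:4;              % grid on which to estimate the density
kd = zeros(size(x));        % vector for the kernel density
for i = 1:length(data)      % for every data value ...
    % ... add a normalized Gaussian centered at the data value:
    kd = kd + exp(-0.5*((x-data(i))/sigma).^2)/sqrt(2.0*pi)/sigma;
end
kd = kd/length(data);       % normalize to integral one
plot(x, kd);
\end{lstlisting}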
\newpage
\begin{exercise}{gaussiankerneldensity.m}{}
Construct and plot a kernel density of the data from the previous
two exercises.
\end{exercise}
\subsection{Cumulative distributions}
The \enterm{cumulative distribution function},
\enterm[cdf|see{cumulative distribution function}]{cdf}, or
\enterm[cumulative density function|see{cumulative distribution
function}]{cumulative density function}
(\determ{kumulative Verteilung}) is the integral of the probability
density up to a given value $x$:
\[ F(x) = \int_{-\infty}^{x} p(x') \, dx' \; . \]
As such, the cumulative distribution is itself a probability: the
probability of getting a value smaller than $x$.
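As an aside, for the normal distribution this integral cannot be
expressed in elementary functions; it is usually written in terms of
the error function,
\[ F_g(x) = \frac{1}{2}\left(1 +
\mathrm{erf}\left(\frac{x-\mu}{\sqrt{2}\,\sigma}\right)\right) \; , \]
which is why below we compute it by numerical integration.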
For estimating the cumulative distribution from a set of data values
we do not need to rely on histograms or kernel densities. Instead, it
can be computed directly from the data, without the need to choose a
bin width or a kernel width. For a data set of $N$ data values $x_i$
the probability of a data value smaller than $x$ is the number of data
points with values smaller than $x$ divided by $N$. If we sort the
data values, then at each data value $x_i$ the number of data elements
smaller than $x_i$ increases by one and the corresponding probability
of getting a value smaller than $x_i$ increases by $1/N$. That is, the
cumulative distribution estimated from the sorted data is
\[ F(x_i) = \frac{i}{N} \; . \]
See \figref{cumulativefig} for an example.
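In code this estimate boils down to sorting the data (a minimal
sketch; reading off a percentile via \texttt{interp1} is one possible
way to invert the estimated cumulative distribution):
\begin{lstlisting}
x = randn(200, 1);             % some data
xs = sort(x);                  % sorted data values
F = (1:length(xs))/length(xs); % empirical cumulative distribution
plot(xs, F);
med = interp1(F, xs, 0.5);     % median from the inverse cdf
\end{lstlisting}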
The cumulative distribution tells you the fraction of data values that
lie below a certain value and can therefore be used to evaluate the
significance of null hypotheses constructed from data, as is done with
bootstrap methods (see chapter \ref{bootstrapchapter}). Conversely,
the values of quartiles and percentiles can be determined from the
inverse cumulative distribution function.
\begin{figure}[t]
\includegraphics[width=1\textwidth]{cumulative}
\titlecaption{\label{cumulativefig} Estimation of the cumulative
distribution.}{The cumulative distribution $F(x)$ estimated from
100 data values drawn from a normal distribution (red) in
comparison to the true cumulative distribution function computed
by numerically integrating the normal distribution (blue). From
the cumulative distribution function one can read off the
probability of getting values smaller than a given value (here:
$P(x<-1) \approx 0.15$). From the inverse cumulative distribution
the position of percentiles can be computed (here: the median
(50\,\% percentile) is, as expected, close to zero and the third
quartile (75\,\% percentile) is at $x=0.68$).}
\end{figure}
\begin{exercise}{cumulative.m}{cumulative.out}
Generate 200 normally distributed data values and construct an
estimate of the cumulative distribution function from these data.
Compare this estimate with the numerically integrated cumulative
distribution of the normal distribution. Use the estimate to
compute the probability of getting data values smaller than $-1$,
and to determine the 5\,\% percentile.
\end{exercise}
\newpage
\section{Correlations}
Until now we described properties of univariate data sets. In
@ -353,7 +461,10 @@ data in a correlation coefficient close to zero
\begin{figure}[tp]
\includegraphics[width=1\textwidth]{correlation}
\titlecaption{\label{correlationfig} Correlations between pairs of
data.}{Shown are scatter plots of four data sets. Each point is a
single data pair. The correlation coefficient $r$ is given in the top
left of each plot.}
\end{figure}
\begin{exercise}{correlations.m}{}