added section on cumulative densities

commit 6c95ec7256 (parent 12a417d6bc)
@@ -1,6 +1,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \chapter{\tr{Bootstrap Methods}{Bootstrap Methoden}}
+\label{bootstrapchapter}
 
 With the \determ{Bootstrap} one generates the distribution of statistics by resampling
 from the data sample. This has several advantages:
@@ -46,7 +46,7 @@
 \include{programmingstyle/lecture/programmingstyle}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\part{Grundlagen der Datenanalyse}
+\part{Data analysis}
 
 \graphicspath{{statistics/lecture/}{statistics/lecture/figures/}}
 \lstset{inputpath=statistics/code}
statistics/code/cumulative.m (new file)
@@ -0,0 +1,18 @@
+x = randn(200, 1);              % generate some data
+xs = sort(x);                   % sort the data
+cdf = [1:length(x)]/length(x);  % cumulative fraction of data values
+plot(xs, cdf);
+hold on;
+
+dx = 0.01;
+xx = [-4:dx:4];                        % x-values for Gaussian pdf
+gauss = exp(-0.5*xx.^2)/sqrt(2.0*pi);  % Gaussian pdf
+gausscdf = cumsum(gauss)*dx;           % integrate the pdf to get the cdf
+plot(xx, gausscdf);
+hold off;
+
+printf('data : probability of x<-1: %.2f\n', cdf(xs<-1.0)(end))
+printf('gauss: probability of x<-1: %.2f\n', gausscdf(xx<-1.0)(end))
+printf('\n')
+printf('data : 5%% percentile at %.2f\n', xs(cdf<0.05)(end))
+printf('gauss: 5%% percentile at %.2f\n', xx(gausscdf<0.05)(end))
@@ -1,11 +1,11 @@
 x = randn(100, 1); % generate some data
-bins1 = -4:2:4; % large bins
-bins2 = -4:0.5:4; % small bins
-[h1,b1] = hist(x,bins1);
-[h2,b2] = hist(x,bins2);
+db1 = 2;   % large bin width
+db2 = 0.5; % small bin width
+bins1 = -4:db1:4; % large bins
+bins2 = -4:db2:4; % small bins
+[h1,b1] = hist(x, bins1);
+[h2,b2] = hist(x, bins2);
 
-subplot( 1, 2, 1 );
 bar(b1, h1)
 hold on
 bar(b2, h2, 'facecolor', 'r' )
@@ -1,9 +1,9 @@
 hn1 = h1/sum(h1)/db1;
 hn2 = h2/sum(h2)/db2;
-subplot( 1, 2, 2 )
-bar(b1,hn1)
+bar(b1, hn1)
 hold on
-bar(b2,hn2, 'facecolor', 'r' )
+bar(b2, hn2, 'facecolor', 'r' )
 xlabel('x')
 ylabel('Probability density')
 hold off
statistics/code/gaussiankerneldensity.m (new file)
@@ -0,0 +1,42 @@
+data = randn(100, 1); % generate some data
+sigma = 0.2;          % standard deviation of Gaussian kernel
+xmin = -4.0;          % minimum x value for kernel density
+xmax = 4.0;           % maximum x value for kernel density
+dx = 0.05*sigma;      % step size for kernel density
+xg = [-4.0*sigma:dx:4.0*sigma]; % x-axis for single Gaussian kernel
+% single Gaussian kernel:
+kernel = exp(-0.5*(xg/sigma).^2)/sqrt(2.0*pi)/sigma;
+ng = (length(kernel)-1)/2;  % half the length of the Gaussian
+x = [xmin:dx:xmax+0.5*dx];  % x-axis for kernel density
+kd = zeros(1, length(x));   % vector for kernel density
+for i = 1:length(data)      % for every data value ...
+  xd = data(i);
+  % index of data value in kernel density vector:
+  inx = round((xd-xmin)/dx)+1;
+  % start index for Gaussian in kernel density vector:
+  k0 = inx-ng;
+  % end index for Gaussian in kernel density vector:
+  k1 = inx+ng;
+  g0 = 1;               % start index in Gaussian
+  g1 = length(kernel);  % end index in Gaussian
+  % check whether left side of Gaussian extends below xmin:
+  if inx < ng+1
+    % adjust start indices accordingly
+    % (kernel center ng+1 aligns with kd index inx):
+    k0 = 1;
+    g0 = ng-inx+2;
+  end
+  % check whether right side of Gaussian extends above xmax:
+  if inx > length(kd)-ng
+    % adjust end indices accordingly:
+    k1 = length(kd);
+    g1 = length(kernel)-(inx+ng-length(kd));
+  end
+  % add Gaussian on kernel density:
+  kd(k0:k1) = kd(k0:k1) + kernel(g0:g1);
+end
+kd /= length(data); % normalize by number of data points
+
+% plot kernel density:
+plot(x, kd)
+xlabel('x')
+ylabel('Probability density')
statistics/lecture/cumulative.py (new file)
@@ -0,0 +1,52 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+# data:
+rng = np.random.RandomState(981)
+data = rng.randn(100)
+xs = np.sort(data)
+cdf = np.arange(len(xs))/float(len(xs))
+
+# Gauss:
+dx = 0.01
+xx = np.arange(-4.0, 4.0, dx)
+gauss = np.exp(-0.5*xx*xx)/np.sqrt(2.0*np.pi)
+gausscdf = np.cumsum(gauss)*dx
+
+# plot:
+plt.xkcd()
+fig = plt.figure( figsize=(6, 2.6) )
+ax = fig.add_subplot(1, 1, 1)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel( 'x' )
+ax.set_xlim(-3.2, 3.2)
+ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
+ax.set_ylabel( 'F(x)' )
+ax.set_ylim(-0.05, 1.05)
+ax.set_yticks( np.arange( 0.0, 1.1, 0.2 ) )
+
+med = xs[cdf>=0.5][0]
+ax.plot([-3.2, med, med], [0.5, 0.5, 0.0], 'k', lw=1, zorder=-5)
+ax.text(-2.8, 0.55, 'F=0.5')
+ax.text(0.15, 0.25, 'median at %.2f' % med)
+
+q3 = xs[cdf>=0.75][0]
+ax.plot([-3.2, q3, q3], [0.75, 0.75, 0.0], 'k', lw=1, zorder=-5)
+ax.text(-2.8, 0.8, 'F=0.75')
+ax.text(0.8, 0.5, '3. quartile at %.2f' % q3)
+
+p = cdf[xs>=-1.0][0]
+ax.plot([-3.2, -1.0, -1.0], [p, p, 0.0], 'k', lw=1, zorder=-5)
+ax.text(-2.8, 0.2, 'F=%.2f' % p)
+ax.text(-0.9, 0.05, '-1')
+
+ax.plot(xx, gausscdf, '-', color='#0000ff', lw=2, zorder=-1)
+ax.plot(xs, cdf, '-', color='#cc0000', lw=4, zorder=-1)
+ax.plot([-3.2, 3.2], [1.0, 1.0], '--', color='k', lw=2, zorder=-10)
+
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3)
+fig.savefig( 'cumulative.pdf' )
+#plt.show()
@@ -34,6 +34,6 @@ ax.set_ylim(0, 0.23)
 ax.set_ylabel( 'Probability' )
 ax.plot([0.2, 6.8], [1.0/6.0, 1.0/6.0], '-b', lw=2, zorder=1)
 ax.hist([x2, x1], bins, normed=True, color=['#FFCC00', '#FFFF66' ], zorder=10)
-plt.tight_layout()
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
 fig.savefig( 'diehistograms.pdf' )
 #plt.show()
statistics/lecture/kerneldensity.py (new file)
@@ -0,0 +1,83 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+# normal distribution:
+rng = np.random.RandomState(6281)
+x = np.arange( -4.0, 4.0, 0.01 )
+g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
+r = rng.randn(100)
+
+def kerneldensity(data, xmin, xmax, sigma=1.0) :
+    dx = 0.05*sigma
+    xg = np.arange(-4.0*sigma, 4.0*sigma + 0.5*dx, dx)
+    gauss = np.exp(-0.5*xg*xg/sigma/sigma)/np.sqrt(2.0*np.pi)/sigma
+    ng = len(gauss)//2   # integer division (works in Python 2 and 3)
+    x = np.arange(xmin, xmax+0.5*dx, dx)
+    kd = np.zeros(len(x))
+    for xd in data:
+        inx = int((xd-xmin)/dx)
+        k0 = inx-ng
+        k1 = inx+ng+1
+        g0 = 0
+        g1 = len(gauss)
+        if inx < ng:
+            k0 = 0
+            g0 = ng-inx
+        if inx >= len(kd)-ng:
+            k1 = len(kd)
+            g1 = len(gauss)-(inx+ng-len(kd)+1)
+        kd[k0:k1] += gauss[g0:g1]
+    kd /= len(data)
+    return kd, x
+
+
+plt.xkcd()
+
+fig = plt.figure( figsize=(6,3) )
+ax = fig.add_subplot(2, 2, 1)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel( 'x' )
+ax.set_xlim(-3.2, 3.2)
+ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
+ax.set_ylabel( 'p(x)' )
+ax.set_ylim(0.0, 0.49)
+ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
+#ax.plot(x, g, '-b', lw=2, zorder=-1)
+ax.hist(r, np.arange(-4.1, 4, 0.4), normed=True, color='#FFCC00', zorder=-5)
+
+ax = fig.add_subplot(2, 2, 3)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel( 'x' )
+ax.set_xlim(-3.2, 3.2)
+ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
+ax.set_ylabel( 'p(x)' )
+ax.set_ylim(0.0, 0.49)
+ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
+#ax.plot(x, g, '-b', lw=2, zorder=-1)
+ax.hist(r, np.arange(-4.3, 4, 0.4), normed=True, color='#FFCC00', zorder=-5)
+
+ax = fig.add_subplot(1, 2, 2)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel( 'x' )
+ax.set_xlim(-3.2, 3.2)
+ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) )
+ax.set_ylabel( 'Probab. density p(x)' )
+ax.set_ylim(0.0, 0.49)
+ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) )
+kd, xx = kerneldensity(r, -3.2, 3.2, 0.2)
+ax.fill_between(xx, 0.0, kd, color='#FF9900', zorder=-5)
+ax.plot(xx, kd, '-', lw=3, color='#CC0000', zorder=-1)
+
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3)
+fig.savefig( 'kerneldensity.pdf' )
+#plt.show()
@@ -38,7 +38,7 @@ ax.plot(x, g, '-b', lw=2, zorder=-1)
 ax.hist(r, 5, normed=True, color='#CC0000', zorder=-10)
 ax.hist(r, 20, normed=True, color='#FFCC00', zorder=-5)
 
-plt.tight_layout()
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
 fig.savefig( 'pdfhistogram.pdf' )
 #plt.show()
@@ -7,7 +7,7 @@ g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)
 q = [ -0.67488, 0.0, 0.67488 ]
 
 plt.xkcd()
-fig = plt.figure( figsize=(6,3.4) )
+fig = plt.figure( figsize=(6,3.2) )
 ax = fig.add_subplot( 1, 1, 1 )
 ax.spines['right'].set_visible(False)
 ax.spines['top'].set_visible(False)
@@ -44,6 +44,7 @@ ax.plot(x,g, 'b', lw=4)
 ax.plot([0.0, 0.0], [0.0, 0.45], 'k', lw=2 )
 ax.plot([q[0], q[0]], [0.0, 0.4], 'k', lw=2 )
 ax.plot([q[2], q[2]], [0.0, 0.4], 'k', lw=2 )
-plt.tight_layout()
+plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0)
+#plt.tight_layout()
 fig.savefig( 'quartile.pdf' )
 #plt.show()
@@ -19,9 +19,6 @@
 \section{TODO}
 \begin{itemize}
 \item Replace exercise 1.3 (boxwhisker) by one recreating figure 1.
-\item Proper introduction to probabilities and densities first!
-\item Cumulative probability
-\item Kernel histograms (important for convolved PSTH)!
 \end{itemize}
 
 \end{document}
@@ -80,12 +80,12 @@ used to illustrate the standard deviation of the data
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{median}
 \titlecaption{\label{medianfig} Median, mean and mode of a
-probability distribution.}{Left: Median, mean and mode are
-identical for the symmetric and unimodal normal distribution.
-Right: for asymmetric distributions these three measures differ. A
-heavy tail of a distribution pulls out the mean most strongly. In
-contrast, the median is more robust against heavy tails, but not
-necessarily identical with the mode.}
+probability distribution.}{Left: Median, mean and mode coincide
+for the symmetric and unimodal normal distribution. Right: for
+asymmetric distributions these three measures differ. A heavy tail
+of a distribution pulls out the mean most strongly. In contrast,
+the median is more robust against heavy tails, but not necessarily
+identical with the mode.}
 \end{figure}
 
 The \enterm{mode} is the most frequent value, i.e. the position of the maximum of the probability distribution.
@@ -113,7 +113,10 @@ not smaller than the median (\figref{medianfig}).
 
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{quartile}
-\titlecaption{\label{quartilefig} Median and quartiles of a normal distribution.}{}
+\titlecaption{\label{quartilefig} Median and quartiles of a normal
+distribution.}{The interquartile range between the first and the
+third quartile contains 50\,\% of the data and includes the
+median.}
 \end{figure}
 
 The distribution of data can be further characterized by the position
@@ -164,7 +167,9 @@ The distribution of values in a data set is estimated by histograms
 $N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$
 (\figref{diehistogramsfig} left). The bins tile the data range
 usually into intervals of the same size. The width of the bins is
-called the bin width.
+called the bin width. The frequencies $n_i$ plotted against the
+categories $i$ constitute the \enterm{histogram}, or the \enterm{frequency
+histogram}.
 
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{diehistograms}
@@ -219,7 +224,7 @@ category $i$, i.e. of getting a data value in the $i$-th bin.
 \subsection{Probability density functions}
 
 In cases where we deal with data sets of measurements of a real
-quantity (e.g. the length of snakes, the weight of elephants, the time
+quantity (e.g. lengths of snakes, weights of elephants, times
 between succeeding spikes) there is no natural bin width for computing
 a histogram. In addition, the probability of measuring a data value that
 equals exactly a specific real number like, e.g., 0.123456789 is zero, because
@@ -230,7 +235,7 @@ range. For example, we can ask for the probability $P(1.2<x<1.3)$ to
 get a measurement between 1.2 and 1.3 (\figref{pdfprobabilitiesfig}). More
 generally, we want to know the probability $P(x_0<x<x_1)$ to obtain a
 measurement between $x_0$ and $x_1$. If we define the width of the
-range defined by $x_0$ and $x_1$ is $\Delta x = x_1 - x_0$ then the
+range between $x_0$ and $x_1$ as $\Delta x = x_1 - x_0$ then the
 probability can also be expressed as $P(x_0<x<x_0 + \Delta x)$.
 
 In the limit of very small ranges $\Delta x$ the probability of
@@ -238,44 +243,45 @@ getting a measurement between $x_0$ and $x_0+\Delta x$ scales down to
 zero with $\Delta x$:
 \[ P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x \; . \]
 Here the quantity $p(x_0)$ is a so-called \enterm{probability
-density}. This is not a unitless probability with values between 0
-and 1, but a number that takes on any positive real number and has as
-a unit the inverse of the unit of the data values --- hence the name
-``density''.
+density} that is larger than zero and that describes the
+distribution of the data values. The probability density is not a
+unitless probability with values between 0 and 1, but a number that
+can take on any positive real value and has as a unit the inverse of the
+unit of the data values --- hence the name ``density''.
 
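This approximation is easy to probe numerically (not part of the commit; a minimal numpy sketch with illustrative names):

import numpy as np

def p(x):
    # standard normal probability density:
    return np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi)

rng = np.random.RandomState(271)
r = rng.randn(1000000)  # many samples
x0 = 1.0
for dx in [0.5, 0.1, 0.01]:
    # fraction of samples falling into [x0, x0+dx):
    prob = np.mean((r >= x0) & (r < x0+dx))
    print('dx=%4.2f: P=%.5f  p(x0)*dx=%.5f' % (dx, prob, p(x0)*dx))
# the two columns converge as dx shrinks
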
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{pdfprobabilities}
 \titlecaption{\label{pdfprobabilitiesfig} Probability of a
 probability density.}{The probability of a data value $x$ between,
-e.g., zero and one is the integral (red area) over the probability
+e.g., zero and one is the integral (red area) of the probability
 density (blue).}
 \end{figure}
 
 The probability to get a value $x$ between $x_1$ and $x_2$ is
-given by the integral over the probability density:
+given by the integral of the probability density:
 \[ P(x_1 < x < x_2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
-Because the probability to get any value $x$ is one, the integral over
-the probability density
+Because the probability of getting any value $x$ at all is one, the integral of
+the probability density over the whole real axis must be one:
 
-Da die Wahrscheinlichkeit irgendeines Wertes $x$ Eins ergeben muss gilt die Normierung
 \begin{equation}
 \label{pdfnorm}
 P(-\infty < x < \infty) = \int\limits_{-\infty}^{+\infty} p(x) \, dx = 1 \; .
 \end{equation}
 
-\pagebreak[2]
-Die gesamte Funktion $p(x)$, die jedem Wert $x$ einen
-Wahrscheinlichkeitsdichte zuordnet wir auch
-\determ{Wahrscheinlichkeitsdichtefunktion} (\enterm{probability
-density function}, \enterm[pdf|see{probability density
-function}]{pdf}, oder kurz \enterm[density|see{probability density
-function}]{density}) genannt. Die bekannteste
-Wahrscheinlichkeitsdichtefunktion ist die der \determ{Normalverteilung}
-\[ p_g(x) =
-\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
---- die \determ{Gau{\ss}sche-Glockenkurve} mit Mittelwert $\mu$ und
-Standardabweichung $\sigma$.
+The function $p(x)$ that assigns a probability density to every $x$
+is called the \enterm{probability density function},
+\enterm[pdf|see{probability density function}]{pdf}, or just
+\enterm[density|see{probability density function}]{density}
+(\determ{Wahrscheinlichkeitsdichtefunktion}). The well-known
+\enterm{normal distribution} (\determ{Normalverteilung}) is an example of a
+probability density function
+\[ p_g(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
+--- the \enterm{Gaussian distribution}
+(\determ{Gau{\ss}sche-Glockenkurve}) with mean $\mu$ and standard
+deviation $\sigma$.
+The factor in front of the exponential function ensures the normalization to
+$\int p_g(x) \, dx = 1$, \eqnref{pdfnorm}.
 
+\newpage
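The stated normalization can be checked numerically; a minimal sketch, assuming an arbitrary choice of mu and sigma (not code from the commit):

import numpy as np

mu = 1.0     # arbitrary mean
sigma = 0.5  # arbitrary standard deviation
x = np.arange(-10.0, 10.0, 0.001)  # range wide enough to cover the density
pg = np.exp(-0.5*((x-mu)/sigma)**2)/np.sqrt(2.0*np.pi*sigma**2)
print('integral of p_g: %.4f' % np.trapz(pg, x))  # prints 1.0000
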
 \begin{exercise}{gaussianpdf.m}{gaussianpdf.out}
 \begin{enumerate}
 \item Plot the probability density of the normal distribution $p_g(x)$.
@@ -288,6 +294,38 @@ Standardabweichung $\sigma$.
 \end{enumerate}
 \end{exercise}
 
+\newpage
+Histograms of real-valued data depend on both the number of data
+values and the chosen bin width. As in the example with the die
+(\figref{diehistogramsfig} left), the height of the histogram gets
+larger the larger the size of the data set. Also, as the bin width is
+increased the height of the histogram increases, because more data
+values fall within each bin (\figref{pdfhistogramfig} left).
+
+\begin{exercise}{gaussianbins.m}{}
+Draw 100 random data values from a Gaussian distribution and plot
+histograms of the data with different bin sizes. What do you
+observe?
+\end{exercise}
+
+To turn such histograms into estimates of probability densities they
+need to be normalized such that according to \eqnref{pdfnorm} their
+integral equals one. While histograms of categorical data are
+normalized such that their sum equals one, here we need to integrate
+over the histogram. The integral is the area (not the height) of the
+histogram bars. Each bar has the height $n_i$ and the width $\Delta
+x$. The total area $A$ of the histogram with $M$ bins is thus
+\[ A = \sum_{i=1}^M ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^M n_i = N \, \Delta x \]
+and the normalized histogram has the heights
+\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^M n_i} = \frac{n_i}{N
+\Delta x} \; .\]
+A histogram needs to be divided by both the sum of the frequencies
+$n_i$ and the bin width $\Delta x$ to result in an estimate of the
+corresponding probability density. Only then can the distribution be
+compared with other distributions and in particular with theoretical
+probability density functions like the one of the normal distribution
+(\figref{pdfhistogramfig} right).
+
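A minimal numpy sketch of this division by N and the bin width (illustrative, not part of the commit; numpy's density flag serves only as a cross-check):

import numpy as np

data = np.random.randn(100)             # some data
bins = np.arange(-4.0, 4.1, 0.5)
n, edges = np.histogram(data, bins)     # frequencies n_i
dx = edges[1] - edges[0]                # bin width
p = n/float(np.sum(n))/dx               # divide by N AND by the bin width
print('integral: %.3f' % np.sum(p*dx))  # -> 1.000
# cross-check against numpy's built-in normalization:
pd, _ = np.histogram(data, bins, density=True)
print(np.allclose(p, pd))               # -> True
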
 \begin{figure}[t]
 \includegraphics[width=1\textwidth]{pdfhistogram}
 \titlecaption{\label{pdfhistogramfig} Histograms with different bin
@@ -300,36 +338,106 @@ Standardabweichung $\sigma$.
 normal distributions (blue).}
 \end{figure}
 
-\pagebreak[4]
-\begin{exercise}{gaussianbins.m}{}
-Draw 100 random data from a Gaussian distribution and plot
-histograms with different bin sizes of the data. What do you
-observe?
+\newpage
+\begin{exercise}{gaussianbinsnorm.m}{}
+Normalize the histogram of the previous exercise to a probability density.
 \end{exercise}
 
-Damit Histogramme von reellen Messwerten trotz unterschiedlicher
-Anzahl von Messungen und unterschiedlicher Klassenbreiten
-untereinander vergleichbar werden und mit bekannten
-Wahrscheinlichkeitsdichtefunktionen verglichen werden k\"onnen,
-m\"ussen sie auf das Integral Eins normiert werden
-\eqnref{pdfnorm}. Das Integral (nicht die Summe) \"uber das Histogramm
-soll Eins ergeben --- denn die Wahrscheinlichkeit, dass irgendeiner
-der Messwerte auftritt mu{\ss} Eins sein. Das Integral ist die
-Fl\"ache des Histogramms, die sich aus der Fl\"ache der einzelnen
-Histogrammbalken zusammen setzt. Die Balken des Histogramms haben die
-H\"ohe $n_i$ und die Breite $\Delta x$. Die Gesamtfl\"ache $A$ des
-Histogramms ist also
-\[ A = \sum_{i=1}^N ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^N n_i \]
-und das normierte Histogramm hat die H\"ohe
-\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} \]
-Es muss also nicht nur durch die Summe, sondern auch durch die Breite
-$\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
 
-\begin{exercise}{gaussianbinsnorm.m}{}
-Normiere das Histogramm der vorherigen \"Ubung zu einer Wahrscheinlichkeitsdichte.
+\newpage
+\subsection{Kernel densities}
 
+A problem of using histograms for estimating probability densities is
+that they have hard bin edges. Depending on where the bin edges are placed
+a data value falls in one or the other bin.
+
+\begin{figure}[t]
+\includegraphics[width=1\textwidth]{kerneldensity}
+\titlecaption{\label{kerneldensityfig} Kernel densities.}{Left: The
+histogram-based estimate of the probability density depends
+also on the position of the bins. In the bottom plot the bins have
+been shifted by half a bin width (here $\Delta x=0.4$) and as a
+result details of the probability density look different. Look,
+for example, at the height of the largest bin. Right: In contrast,
+a kernel density is uniquely defined for a given kernel width
+(here Gaussian kernels with standard deviation $\sigma=0.2$).}
+\end{figure}
+
+To avoid this problem one can use so-called \enterm{kernel densities}
+for estimating probability densities from data. Here every data point
+is replaced by a kernel (a function with integral one, like for
+example the Gaussian function) that is moved exactly to the position
+indicated by the data value. Then all the kernels of all the data
+values are summed up, the sum is divided by the number of data values,
+and we get an estimate of the probability density.
+
+As for the histogram, where we need to choose a bin width, we need to
+choose the width of the kernels appropriately.
+
\newpage
|
||||||
|
\begin{exercise}{gaussiankerneldensity.m}{}
|
||||||
|
Construct and plot a kernel density of the data from the previous
|
||||||
|
two exercises.
|
||||||
\end{exercise}
|
\end{exercise}
|
||||||
|
|
||||||
|
\subsection{Cumulative distributions}
|
||||||
|
The \enterm{cumulative distribution function},
|
||||||
|
\enterm[cdf|see{cumulative distribution function}]{cdf}, or
|
||||||
|
\enterm[cumulative density function|see{cumulative distribution
|
||||||
|
function}]{cumulative density function}
|
||||||
|
(\determ{kumulative Verteilung}) is the integral over the probability density
|
||||||
|
up to any value $x$:
|
||||||
|
\[ F(x) = \int_{-\infty}^x p(x') \, dx' \]
|
||||||
|
As such the cumulative distribution is a probability. It is the
|
||||||
|
probability of getting a value smaller than $x$.
|
||||||
|
|
||||||
|
For estimating the cumulative distribution from a set of data values
|
||||||
|
we do not need to rely on histograms or kernel densities. Instead, it
|
||||||
|
can be computed from the data directly without the need of a bin width
|
||||||
|
or width of a kernel. For a data set of $N$ data values $x_i$ the
|
||||||
|
probability of a data value smaller than $x$ is the number of data
|
||||||
|
points with values smaller than $x$ divided by $N$. If we sort the
|
||||||
|
data values than at each data value $x_i$ the number of data elements
|
||||||
|
smaller than $x_i$ is increased by one and the corresponding
|
||||||
|
probability of getting a value smaller than $x_i$ is increased by $1/N$.
|
||||||
|
That is, the cumulative distribution is
|
||||||
|
\[ F(x_i) = \frac{i}{N} \]
|
||||||
|
See \figref{cumulativefig} for an example.
|
||||||
|
|
||||||
|
The cumulative distribution tells you the fraction of data that are
|
||||||
|
below a certain value and can therefore be used to evaluate significance
|
||||||
|
from Null-hypothesis constructed from data, as it is done with bootstrap methods
|
||||||
|
(see chapter \ref{bootstrapchapter}). The other way around the values of quartiles
|
||||||
|
and percentiles can be determined from the inverse cumulative function.
|
||||||
|
|
||||||
|
\begin{figure}[t]
|
||||||
|
\includegraphics[width=1\textwidth]{cumulative}
|
||||||
|
\titlecaption{\label{cumulativefig} Estimation of the cumulative
|
||||||
|
distribution.}{The cumulative distribution $F(x)$ estimated from
|
||||||
|
100 data values drawn from a normal distribution (red) in
|
||||||
|
comparison to the true cumulative distribution function computed
|
||||||
|
by numerically integrating the normal distribution function
|
||||||
|
(blue). From the cumulative distribution function one can read of
|
||||||
|
the probabilities of getting values smaller than a given value
|
||||||
|
(here: $P(x \ge -1) \approx 0.15$). From the inverse cumulative
|
||||||
|
distribution the position of percentiles can be computed (here:
|
||||||
|
the median (50\,\% percentile) is as expected close to zero and
|
||||||
|
the third quartile (75\,\% percentile) at $x=0.68$.}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\begin{exercise}{cumulative.m}{cumulative.out}
|
||||||
|
Generate 200 normally distributed data values and construct an
|
||||||
|
estimate of the cumulative distribution function from this data.
|
||||||
|
|
||||||
|
Compare this estimate with an integral over the normal distribution.
|
||||||
|
|
||||||
|
Use the estimate to compute the probability of having data values
|
||||||
|
smaller than $-1$.
|
||||||
|
|
||||||
|
Use the estimate to compute the value of the 5\,\% percentile.
|
||||||
|
\end{exercise}
|
||||||
|
|
||||||
|
\newpage
|
||||||
 \section{Correlations}
 
 Until now we described properties of univariate data sets. In
@@ -353,7 +461,10 @@ data in a correlation coefficient close to zero
 
 \begin{figure}[tp]
 \includegraphics[width=1\textwidth]{correlation}
-\titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
+\titlecaption{\label{correlationfig} Correlations between pairs of
+data.}{Shown are scatter plots of four data sets. Each point is a
+single data pair. The correlation coefficient $r$ is given in the top
+left of each plot.}
 \end{figure}
 
 \begin{exercise}{correlations.m}{}