diff --git a/bootstrap/lecture/bootstrap.tex b/bootstrap/lecture/bootstrap.tex index 30fd834..deeb0de 100644 --- a/bootstrap/lecture/bootstrap.tex +++ b/bootstrap/lecture/bootstrap.tex @@ -1,6 +1,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \chapter{\tr{Bootstrap Methods}{Bootstrap Methoden}} +\label{bootstrapchapter} Beim \determ{Bootstrap} erzeugt man sich die Verteilung von Statistiken durch Resampling aus der Stichprobe. Das hat mehrere Vorteile: diff --git a/scientificcomputing-script.tex b/scientificcomputing-script.tex index 4b7edde..325dbe8 100644 --- a/scientificcomputing-script.tex +++ b/scientificcomputing-script.tex @@ -46,7 +46,7 @@ \include{programmingstyle/lecture/programmingstyle} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\part{Grundlagen der Datenanalyse} +\part{Data analysis} \graphicspath{{statistics/lecture/}{statistics/lecture/figures/}} \lstset{inputpath=statistics/code} diff --git a/statistics/code/cumulative.m b/statistics/code/cumulative.m new file mode 100644 index 0000000..44ebbc5 --- /dev/null +++ b/statistics/code/cumulative.m @@ -0,0 +1,18 @@ +x = randn(200, 1); % generate some data +xs = sort(x); % sort the data +cdf = [1:length(x)]/length(x); % cumulative +plot(xs, cdf); +hold on; + +dx = 0.01; +xx = [-4:dx:4]; % x-values for Gaussian pdf +gauss = exp(-0.5*xx.^2)/sqrt(2.0*pi); % Gaussian pdf +gausscdf = cumsum(gauss)*dx; +plot(xx, gausscdf); +hold off; + +printf('data : probability of x<-1: %.2f\n', cdf(xs<-1.0)(end)) +printf('gauss: probability of x<-1: %.2f\n', gausscdf(xx<-1.0)(end)) +printf('\n') +printf('data : 5%% percentile at %.2f\n', xs(cdf<0.05)(end)) +printf('gauss: 5%% percentile at %.2f\n', xx(gausscdf<0.05)(end)) diff --git a/statistics/code/gaussianbins.m b/statistics/code/gaussianbins.m index 9e8c13e..3fe9987 100644 --- a/statistics/code/gaussianbins.m +++ b/statistics/code/gaussianbins.m @@ -1,11 +1,11 @@ x = randn(100, 1); % generate some data +db1 = 2; % large bin width +db2 = 0.5; % small bin width +bins1 = -4:db1:4; % large bins +bins2 = -4:db2:4; % small bins +[h1,b1] = hist(x, bins1); +[h2,b2] = hist(x, bins2); -bins1 = -4:2:4; % large bins -bins2 = -4:0.5:4; % small bins -[h1,b1] = hist(x,bins1); -[h2,b2] = hist(x,bins2); - -subplot( 1, 2, 1 ); bar(b1, h1) hold on bar(b2, h2, 'facecolor', 'r' ) diff --git a/statistics/code/gaussianbinsnorm.m b/statistics/code/gaussianbinsnorm.m index 3341bd0..8cdced2 100644 --- a/statistics/code/gaussianbinsnorm.m +++ b/statistics/code/gaussianbinsnorm.m @@ -1,9 +1,9 @@ hn1 = h1/sum(h1)/db1; hn2 = h2/sum(h2)/db2; -subplot( 1, 2, 2 ) -bar(b1,hn1) + +bar(b1, hn1) hold on -bar(b2,hn2, 'facecolor', 'r' ) +bar(b2, hn2, 'facecolor', 'r' ) xlabel('x') ylabel('Probability density') hold off diff --git a/statistics/code/gaussiankerneldensity.m b/statistics/code/gaussiankerneldensity.m new file mode 100644 index 0000000..4cb7925 --- /dev/null +++ b/statistics/code/gaussiankerneldensity.m @@ -0,0 +1,42 @@ +data = randn(100, 1); % generate some data +sigma = 0.2; % standard deviation of Gaussian kernel +xmin = -4.0; % minimum x value for kernel density +xmax = 4.0; % maximum x value for kernel density +dx = 0.05*sigma; % step size for kernel density +xg = [-4.0*sigma:dx:4.0*sigma]; % x-axis for single Gaussian kernel +% single Gaussian kernel: +kernel = exp(-0.5*(xg/sigma).^2)/sqrt(2.0*pi)/sigma; +ng = (length(kernel)-1)/2; % half the length of the Gaussian +x = 
[xmin:dx:xmax+0.5*dx]; % x-axis for kernel density +kd = zeros(1, length(x)); % vector for kernel density +for i = 1:length(data) % for every data value ... + xd = data(i); + % index of data value in kernel density vector: + inx = round((xd-xmin)/dx)+1; + % start index for Gaussian in kernel density vector: + k0 = inx-ng; + % end index for Gaussian in kernel density vector: + k1 = inx+ng; + g0 = 1; % start index in Gaussian + g1 = length(kernel); % end index in Gaussian + % check whether left side of Gaussian extends below xmin: + if inx < ng+1 + % adjust start indices accordingly: + k0 = 1; + g0 = ng-inx+1; + end + % check whether right side of Gaussian extends above xmax: + if inx > length(kd)-ng + % adjust end indices accordingly: + k1 = length(kd); + g1 = length(kernel)-(inx+ng-length(kd)); + end + % add Gaussian on kernel density: + kd(k0:k1) = kd(k0:k1) + kernel(g0:g1); +end +kd /= length(data); % normalize by number of data points + +% plot kernel density: +plot(x, kd) +xlabel('x') +ylabel('Probability density') diff --git a/statistics/lecture/cumulative.py b/statistics/lecture/cumulative.py new file mode 100644 index 0000000..7c74035 --- /dev/null +++ b/statistics/lecture/cumulative.py @@ -0,0 +1,52 @@ +import numpy as np +import matplotlib.pyplot as plt + +# data: +rng = np.random.RandomState(981) +data = rng.randn(100) +xs = np.sort(data) +cdf = np.arange(len(xs))/float(len(xs)) + +# Gauss: +dx = 0.01 +xx = np.arange(-4.0, 4.0, dx) +gauss = np.exp(-0.5*xx*xx)/np.sqrt(2.0*np.pi) +gausscdf = np.cumsum(gauss)*dx + +# plot: +plt.xkcd() +fig = plt.figure( figsize=(6, 2.6) ) +ax = fig.add_subplot(1, 1, 1) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_xlim(-3.2, 3.2) +ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) ) +ax.set_ylabel( 'F(x)' ) +ax.set_ylim(-0.05, 1.05) +ax.set_yticks( np.arange( 0.0, 1.1, 0.2 ) ) + +med = xs[cdf>=0.5][0] +ax.plot([-3.2, med, med], [0.5, 0.5, 0.0], 'k', lw=1, zorder=-5) +ax.text(-2.8, 0.55, 'F=0.5') +ax.text(0.15, 0.25, 'median at %.2f' % med) + +q3 = xs[cdf>=0.75][0] +ax.plot([-3.2, q3, q3], [0.75, 0.75, 0.0], 'k', lw=1, zorder=-5) +ax.text(-2.8, 0.8, 'F=0.75') +ax.text(0.8, 0.5, '3. 
quartile at %.2f' % q3) + +p = cdf[xs>=-1.0][0] +ax.plot([-3.2, -1.0, -1.0], [p, p, 0.0], 'k', lw=1, zorder=-5) +ax.text(-2.8, 0.2, 'F=%.2f' % p) +ax.text(-0.9, 0.05, '-1') + +ax.plot(xx, gausscdf, '-', color='#0000ff', lw=2, zorder=-1) +ax.plot(xs, cdf, '-', color='#cc0000', lw=4, zorder=-1) +ax.plot([-3.2, 3.2], [1.0, 1.0], '--', color='k', lw=2, zorder=-10) + +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3) +fig.savefig( 'cumulative.pdf' ) +#plt.show() diff --git a/statistics/lecture/diehistograms.py b/statistics/lecture/diehistograms.py index ad53c38..622d39b 100644 --- a/statistics/lecture/diehistograms.py +++ b/statistics/lecture/diehistograms.py @@ -34,6 +34,6 @@ ax.set_ylim(0, 0.23) ax.set_ylabel( 'Probability' ) ax.plot([0.2, 6.8], [1.0/6.0, 1.0/6.0], '-b', lw=2, zorder=1) ax.hist([x2, x1], bins, normed=True, color=['#FFCC00', '#FFFF66' ], zorder=10) -plt.tight_layout() +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0) fig.savefig( 'diehistograms.pdf' ) #plt.show() diff --git a/statistics/lecture/kerneldensity.py b/statistics/lecture/kerneldensity.py new file mode 100644 index 0000000..6f93820 --- /dev/null +++ b/statistics/lecture/kerneldensity.py @@ -0,0 +1,83 @@ +import numpy as np +import matplotlib.pyplot as plt + +# normal distribution: +rng = np.random.RandomState(6281) +x = np.arange( -4.0, 4.0, 0.01 ) +g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi) +r = rng.randn(100) + +def kerneldensity(data, xmin, xmax, sigma=1.0) : + dx = 0.05*sigma + xg = np.arange(-4.0*sigma, 4.0*sigma + 0.5*dx, dx) + gauss = np.exp(-0.5*xg*xg/sigma/sigma)/np.sqrt(2.0*np.pi)/sigma + ng = len(gauss)/2 + x = np.arange(xmin, xmax+0.5*dx, dx) + kd = np.zeros(len(x)) + for xd in data: + inx = int((xd-xmin)/dx) + k0 = inx-ng + k1 = inx+ng+1 + g0 = 0 + g1 = len(gauss) + if inx < ng: + k0 = 0 + g0 = ng-inx + if inx >= len(kd)-ng: + k1 = len(kd) + g1 = len(gauss)-(inx+ng-len(kd)+1) + kd[k0:k1] += gauss[g0:g1] + kd /= len(data) + return kd, x + + +plt.xkcd() + +fig = plt.figure( figsize=(6,3) ) +ax = fig.add_subplot(2, 2, 1) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_xlim(-3.2, 3.2) +ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) ) +ax.set_ylabel( 'p(x)' ) +ax.set_ylim(0.0, 0.49) +ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) ) +#ax.plot(x, g, '-b', lw=2, zorder=-1) +ax.hist(r, np.arange(-4.1, 4, 0.4), normed=True, color='#FFCC00', zorder=-5) + +ax = fig.add_subplot(2, 2, 3) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_xlim(-3.2, 3.2) +ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) ) +ax.set_ylabel( 'p(x)' ) +ax.set_ylim(0.0, 0.49) +ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) ) +#ax.plot(x, g, '-b', lw=2, zorder=-1) +ax.hist(r, np.arange(-4.3, 4, 0.4), normed=True, color='#FFCC00', zorder=-5) + +ax = fig.add_subplot(1, 2, 2) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_xlim(-3.2, 3.2) +ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) ) +ax.set_ylabel( 'Probab. 
density p(x)' ) +ax.set_ylim(0.0, 0.49) +ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) ) +kd, xx = kerneldensity(r, -3.2, 3.2, 0.2) +ax.fill_between(xx, 0.0, kd, color='#FF9900', zorder=-5) +ax.plot(xx, kd, '-', lw=3, color='#CC0000', zorder=-1) + +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3) +fig.savefig( 'kerneldensity.pdf' ) +#plt.show() + diff --git a/statistics/lecture/pdfhistogram.py b/statistics/lecture/pdfhistogram.py index 4855217..e35606c 100644 --- a/statistics/lecture/pdfhistogram.py +++ b/statistics/lecture/pdfhistogram.py @@ -38,7 +38,7 @@ ax.plot(x, g, '-b', lw=2, zorder=-1) ax.hist(r, 5, normed=True, color='#CC0000', zorder=-10) ax.hist(r, 20, normed=True, color='#FFCC00', zorder=-5) -plt.tight_layout() +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0) fig.savefig( 'pdfhistogram.pdf' ) #plt.show() diff --git a/statistics/lecture/quartile.py b/statistics/lecture/quartile.py index f67133f..5d02797 100644 --- a/statistics/lecture/quartile.py +++ b/statistics/lecture/quartile.py @@ -7,7 +7,7 @@ g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi) q = [ -0.67488, 0.0, 0.67488 ] plt.xkcd() -fig = plt.figure( figsize=(6,3.4) ) +fig = plt.figure( figsize=(6,3.2) ) ax = fig.add_subplot( 1, 1, 1 ) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) @@ -44,6 +44,7 @@ ax.plot(x,g, 'b', lw=4) ax.plot([0.0, 0.0], [0.0, 0.45], 'k', lw=2 ) ax.plot([q[0], q[0]], [0.0, 0.4], 'k', lw=2 ) ax.plot([q[2], q[2]], [0.0, 0.4], 'k', lw=2 ) -plt.tight_layout() +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0) +#plt.tight_layout() fig.savefig( 'quartile.pdf' ) #plt.show() diff --git a/statistics/lecture/statistics-chapter.tex b/statistics/lecture/statistics-chapter.tex index e90cfe1..8d97b6c 100644 --- a/statistics/lecture/statistics-chapter.tex +++ b/statistics/lecture/statistics-chapter.tex @@ -19,9 +19,6 @@ \section{TODO} \begin{itemize} \item Replace exercise 1.3 (boxwhisker) by one recreating figure 1. -\item Proper introduction to probabilities and densities first! -\item Cumulative propability -\item Kernel Histogramms (important for convolved PSTH)! \end{itemize} \end{document} diff --git a/statistics/lecture/statistics.tex b/statistics/lecture/statistics.tex index ca86b2e..1cdfeff 100644 --- a/statistics/lecture/statistics.tex +++ b/statistics/lecture/statistics.tex @@ -80,12 +80,12 @@ used to illustrate the standard deviation of the data \begin{figure}[t] \includegraphics[width=1\textwidth]{median} \titlecaption{\label{medianfig} Median, mean and mode of a - probability distribution.}{Left: Median, mean and mode are - identical for the symmetric and unimodal normal distribution. - Right: for asymmetric distributions these three measures differ. A - heavy tail of a distribution pulls out the mean most strongly. In - contrast, the median is more robust against heavy tails, but not - necessarily identical with the mode.} + probability distribution.}{Left: Median, mean and mode coincide + for the symmetric and unimodal normal distribution. Right: for + asymmetric distributions these three measures differ. A heavy tail + of a distribution pulls out the mean most strongly. In contrast, + the median is more robust against heavy tails, but not necessarily + identical with the mode.} \end{figure} The \enterm{mode} is the most frequent value, i.e. the position of the maximum of the probability distribution. 
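The three measures of central tendency compared in \figref{medianfig} are easy to check on simulated data. A minimal Octave/MATLAB sketch (purely illustrative and not part of this patch; the 20-bin mode estimate is an arbitrary choice) could look like this:

x = randn(1000, 1);                     % some normally distributed data
fprintf('mean  : %.3f\n', mean(x));     % arithmetic mean
fprintf('median: %.3f\n', median(x));   % median
[n, b] = hist(x, 20);                   % histogram with 20 bins
[maxn, imax] = max(n);                  % index of the fullest bin
fprintf('mode  : %.3f\n', b(imax));     % position of the histogram maximum

For the symmetric normal distribution all three values should come out close to zero.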
@@ -113,7 +113,10 @@ not smaller than the median (\figref{medianfig}).
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{quartile}
-  \titlecaption{\label{quartilefig} Median and quartiles of a normal distribution.}{}
+  \titlecaption{\label{quartilefig} Median and quartiles of a normal
+    distribution.}{The interquartile range between the first and the
+    third quartile contains 50\,\% of the data and includes the
+    median.}
 \end{figure}
 
 The distribution of data can be further characterized by the position
@@ -164,7 +167,9 @@ The distribution of values in a data set is estimated by histograms
 $N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$
 (\figref{diehistogramsfig} left). The bins tile the data range
 usually into intervals of the same size. The width of the bins is
-called the bin width.
+called the bin width. The frequencies $n_i$ plotted against the
+categories $i$ are called the \enterm{histogram}, or the
+\enterm{frequency histogram}.
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{diehistograms}
@@ -219,7 +224,7 @@ category $i$, i.e. of getting a data value in the $i$-th bin.
 \subsection{Probability densities functions}
 
 In cases where we deal with data sets of measurements of a real
-quantity (e.g. the length of snakes, the weight of elephants, the time
+quantity (e.g. lengths of snakes, weights of elephants, times
 between succeeding spikes) there is no natural bin width for computing
 a histogram. In addition, the probability of measuring a data value that
 equals exactly a specific real number like, e.g., 0.123456789 is zero, because
@@ -230,7 +235,7 @@ range. For example, we can ask for the probability $P(1.2<x<1.3)$
 to get a measurement between 0 and 1 (\figref{pdfprobabilitiesfig}). More
 generally, we want to know the probability $P(x_0<x<x_1)$ to obtain a
 measurement between $x_0$ and $x_1$. If we define the width of the
-range defined by $x_0$ and $x_1$ is $\Delta x = x_1 - x_0$ then the
+range between $x_0$ and $x_1$ as $\Delta x = x_1 - x_0$, then the
 probability can also be expressed as $P(x_0<x<x_0 + \Delta x)$.
 
 In the limit to very small ranges $\Delta x$ the probability of
@@ -238,44 +243,45 @@ getting a measurement between $x_0$ and $x_0+\Delta x$ scales down to
 zero with $\Delta x$:
 \[ P(x_0<x<x_0+\Delta x) \approx p(x_0) \cdot \Delta x \; . \]
 In here the quantity $p(x_00)$ is a so called \enterm{probability
-  density}. This is not a unitless probability with values between 0
-and 1, but a number that takes on any positive real number and has as
-a unit the inverse of the unit of the data values --- hence the name
-``density''.
+  density} that is non-negative and that describes the
+distribution of the data values. The probability density is not a
+unitless probability with values between 0 and 1, but a number that
+can take on any non-negative real value and has as its unit the
+inverse of the unit of the data values --- hence the name ``density''.
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{pdfprobabilities}
   \titlecaption{\label{pdfprobabilitiesfig} Probability of a
     probability density.}{The probability of a data value $x$ between,
-    e.g., zero and one is the integral (red area) over the probability
+    e.g., zero and one is the integral (red area) of the probability
     density (blue).}
 \end{figure}
 
 The probability to get a value $x$ between $x_1$ and $x_2$ is
-given by the integral over the probability density:
+given by the integral of the probability density:
 \[ P(x_1 < x < x2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
-Because the probability to get any value $x$ is one, the integral over
-the probability density
-
-Da die Wahrscheinlichkeit irgendeines Wertes $x$ Eins ergeben muss gilt die Normierung
+Because the probability of getting any value $x$ at all is one, the
+integral of the probability density over the whole real axis must be one:
 \begin{equation}
   \label{pdfnorm}
   P(-\infty < x < \infty) = \int\limits_{-\infty}^{+\infty} p(x) \, dx = 1 \; .
 \end{equation}
-\pagebreak[2]
-Die gesamte Funktion $p(x)$, die jedem Wert $x$ einen
-Wahrscheinlichkeitsdichte zuordnet wir auch
-\determ{Wahrscheinlichkeitsdichtefunktion} (\enterm{probability
-  density function}, \enterm[pdf|see{probability density
-  function}]{pdf}, oder kurz \enterm[density|see{probability density
-  function}]{density}) genannt. Die bekannteste
-Wahrscheinlichkeitsdichtefunktion ist die der \determ{Normalverteilung}
-\[ p_g(x) =
-\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
---- die \determ{Gau{\ss}sche-Glockenkurve} mit Mittelwert $\mu$ und
-Standardabweichung $\sigma$.
-
+The function $p(x)$, which assigns a probability density to every $x$,
+is called the \enterm{probability density function},
+\enterm[pdf|see{probability density function}]{pdf}, or just
+\enterm[density|see{probability density function}]{density}
+(\determ{Wahrscheinlichkeitsdichtefunktion}). The well-known
+\enterm{normal distribution} (\determ{Normalverteilung}) is an example of a
+probability density function:
+\[ p_g(x) = \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
+--- the \enterm{Gaussian distribution}
+(\determ{Gau{\ss}sche-Glockenkurve}) with mean $\mu$ and standard
+deviation $\sigma$.
+The factor in front of the exponential function ensures the
+normalization to $\int p_g(x) \, dx = 1$, \eqnref{pdfnorm}.
+
+\newpage
 \begin{exercise}{gaussianpdf.m}{gaussianpdf.out}
 \begin{enumerate}
 \item Plot the probability density of the normal distribution $p_g(x)$.
@@ -288,6 +294,38 @@ Standardabweichung $\sigma$.
 \end{enumerate}
 \end{exercise}
 
+\newpage
+Histograms of real-valued data depend on both the number of data
+values and the chosen bin width. As in the example with the die
+(\figref{diehistogramsfig} left), the height of the histogram
+increases with the size of the data set. Also, as the bin width is
+increased, the height of the histogram increases, because more data
+values fall within each bin (\figref{pdfhistogramfig} left).
+
+\begin{exercise}{gaussianbins.m}{}
+  Draw 100 random data values from a Gaussian distribution and plot
+  histograms of the data with different bin sizes. What do you
+  observe?
+\end{exercise}
+
+To turn such histograms into estimates of probability densities they
+need to be normalized such that, according to \eqnref{pdfnorm}, their
+integral equals one. While histograms of categorical data are
+normalized such that their sum equals one, here we need to integrate
+over the histogram. The integral is the area (not the height) of the
+histogram bars. Each bar has the height $n_i$ and the width $\Delta
+x$. The total area $A$ of the histogram is thus
+\[ A = \sum_{i=1}^M ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^M n_i = N \, \Delta x \]
+and the normalized histogram has the heights
+\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^M n_i} = \frac{n_i}{N
+  \Delta x} \; . \]
+A histogram needs to be divided by both the sum of the frequencies
+$n_i$ and the bin width $\Delta x$ to result in an estimate of the
+corresponding probability density. Only then can the distribution be
+compared with other distributions and, in particular, with theoretical
+probability density functions like that of the normal distribution
+(\figref{pdfhistogramfig} right).
+
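This normalization is easy to check numerically: after dividing the frequencies by $N$ and by $\Delta x$, the bar heights multiplied by the bin width should sum to one. A minimal Octave/MATLAB sketch of such a check (purely illustrative, not one of the scripts touched by this patch) could look like this:

x = randn(100, 1);             % some data
dx = 0.5;                      % bin width
bins = -4:dx:4;                % bin centers
[n, b] = hist(x, bins);        % frequencies n_i at bin centers b_i
p = n/sum(n)/dx;               % normalize to a probability density
fprintf('integral of the normalized histogram: %.2f\n', sum(p)*dx);

This is the same normalization that gaussianbinsnorm.m applies to the histograms of the previous exercise.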
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{pdfhistogram}
   \titlecaption{\label{pdfhistogramfig} Histograms with different bin
@@ -300,36 +338,106 @@ Standardabweichung $\sigma$.
     normal distributions (blue).}
 \end{figure}
 
-\pagebreak[4]
-\begin{exercise}{gaussianbins.m}{}
-  Draw 100 random data from a Gaussian distribution and plot
-  histograms with different bin sizes of the data. What do you
-  observe?
+\newpage
+\begin{exercise}{gaussianbinsnorm.m}{}
+  Normalize the histogram of the previous exercise to a probability density.
 \end{exercise}
 
-Damit Histogramme von reellen Messwerten trotz unterschiedlicher
-Anzahl von Messungen und unterschiedlicher Klassenbreiten
-untereinander vergleichbar werden und mit bekannten
-Wahrscheinlichkeitsdichtefunktionen verglichen werden k\"onnen,
-m\"ussen sie auf das Integral Eins normiert werden
-\eqnref{pdfnorm}. Das Integral (nicht die Summe) \"uber das Histogramm
-soll Eins ergeben --- denn die Wahrscheinlichkeit, dass irgendeiner
-der Messwerte auftritt mu{\ss} Eins sein. Das Integral ist die
-Fl\"ache des Histogramms, die sich aus der Fl\"ache der einzelnen
-Histogrammbalken zusammen setzt. Die Balken des Histogramms haben die
-H\"ohe $n_i$ und die Breite $\Delta x$. Die Gesamtfl\"ache $A$ des
-Histogramms ist also
-\[ A = \sum_{i=1}^N ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^N n_i \]
-und das normierte Histogramm hat die H\"ohe
-\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} \]
-Es muss also nicht nur durch die Summe, sondern auch durch die Breite
-$\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
-\begin{exercise}{gaussianbinsnorm.m}{}
-  Normiere das Histogramm der vorherigen \"Ubung zu einer Wahrscheinlichkeitsdichte.
+\newpage
+\subsection{Kernel densities}
+
+A problem of using histograms for estimating probability densities is
+that they have hard bin edges. Depending on where the bin edges are
+placed, a data value falls into one bin or the other.
+
+\begin{figure}[t]
+  \includegraphics[width=1\textwidth]{kerneldensity}
+  \titlecaption{\label{kerneldensityfig} Kernel densities.}{Left: The
+    histogram-based estimate of the probability density also depends
+    on the position of the bins. In the bottom plot the bins have been
+    shifted by half a bin width (here $\Delta x=0.4$) and as a result
+    details of the probability density look different. Look, for
+    example, at the height of the largest bin. Right: In contrast,
+    a kernel density is uniquely defined for a given kernel width
+    (here Gaussian kernels with a standard deviation of $\sigma=0.2$).}
+\end{figure}
+
+To avoid this problem one can use so-called \enterm{kernel densities}
+for estimating probability densities from data. Here every data point
+is replaced by a kernel (a function with integral one, like, for
+example, the Gaussian function) that is centered at the position
+indicated by the data value. Then the kernels of all the data
+values are summed up, the sum is divided by the number of data values,
+and we get an estimate of the probability density.
+
+Just as we need to choose a bin width for the histogram, we need to
+choose the width of the kernels appropriately.
+
+\newpage
+\begin{exercise}{gaussiankerneldensity.m}{}
+  Construct and plot a kernel density of the data from the previous
+  two exercises.
 \end{exercise}
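The index handling in gaussiankerneldensity.m places a precomputed kernel at the position of each data value and clips it at the borders of the range. The underlying idea of summing one kernel per data value and dividing by the number of data values fits into a few lines if each Gaussian is simply evaluated on the full grid. A minimal Octave/MATLAB sketch (purely illustrative, not part of this patch) could look like this:

data = randn(100, 1);                 % generate some data
sigma = 0.2;                          % standard deviation of the Gaussian kernel
x = -4.0:0.05*sigma:4.0;              % grid on which the density is evaluated
kd = zeros(size(x));                  % kernel density estimate
for i = 1:length(data)
    % add a Gaussian kernel centered at the i-th data value:
    kd = kd + exp(-0.5*((x - data(i))/sigma).^2)/sqrt(2.0*pi)/sigma;
end
kd = kd/length(data);                 % normalize by the number of data values
plot(x, kd);
xlabel('x');
ylabel('Probability density');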
+\subsection{Cumulative distributions}
+The \enterm{cumulative distribution function},
+\enterm[cdf|see{cumulative distribution function}]{cdf}, or
+\enterm[cumulative density function|see{cumulative distribution
+  function}]{cumulative density function}
+(\determ{kumulative Verteilung}) is the integral of the probability
+density up to a given value $x$:
+\[ F(x) = \int_{-\infty}^x p(x') \, dx' \; . \]
+As such, the cumulative distribution is itself a probability: the
+probability of getting a value smaller than $x$.
+
+For estimating the cumulative distribution from a set of data values
+we do not need to rely on histograms or kernel densities. Instead, it
+can be computed from the data directly, without the need for a bin
+width or a kernel width. For a data set of $N$ data values $x_i$ the
+probability of getting a data value smaller than $x$ is the number of
+data points with values smaller than $x$ divided by $N$. If we sort
+the data values, then at each data value $x_i$ the number of data
+elements smaller than $x_i$ is increased by one and the corresponding
+probability of getting a value smaller than $x_i$ is increased by $1/N$.
+That is, the cumulative distribution is
+\[ F(x_i) = \frac{i}{N} \; . \]
+See \figref{cumulativefig} for an example.
+
+The cumulative distribution gives the fraction of data values that are
+below a certain value and can therefore be used to evaluate significance
+based on null hypotheses constructed from the data, as is done with
+bootstrap methods (see chapter \ref{bootstrapchapter}). Conversely, the
+values of quartiles and percentiles can be determined from the inverse
+cumulative distribution function.
+
+\begin{figure}[t]
+  \includegraphics[width=1\textwidth]{cumulative}
+  \titlecaption{\label{cumulativefig} Estimation of the cumulative
+    distribution.}{The cumulative distribution $F(x)$ estimated from
+    100 data values drawn from a normal distribution (red) in
+    comparison to the true cumulative distribution function computed
+    by numerically integrating the normal distribution function
+    (blue). From the cumulative distribution function one can read off
+    the probability of getting values smaller than a given value
+    (here: $P(x < -1) \approx 0.15$). From the inverse cumulative
+    distribution the position of percentiles can be computed (here:
+    the median (50\,\% percentile) is, as expected, close to zero and
+    the third quartile (75\,\% percentile) is at $x=0.68$).}
+\end{figure}
+
+\begin{exercise}{cumulative.m}{cumulative.out}
+  Generate 200 normally distributed data values and construct an
+  estimate of the cumulative distribution function from these data.
+
+  Compare this estimate with the cumulative distribution function
+  obtained by numerically integrating the normal distribution.
+
+  Use the estimate to compute the probability of having data values
+  smaller than $-1$.
+
+  Use the estimate to compute the value of the 5\,\% percentile.
+\end{exercise}
+\newpage
 \section{Correlations}
 
 Until now we described properties of univariate data sets. In
@@ -353,7 +461,10 @@ data in a correlation coefficient close to zero
 
 \begin{figure}[tp]
   \includegraphics[width=1\textwidth]{correlation}
-  \titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
+  \titlecaption{\label{correlationfig} Correlations between pairs of
+    data.}{Shown are scatter plots of four data sets. Each point is a
+    single data pair. The correlation coefficient $r$ is given in the
+    top left of each plot.}
 \end{figure}
 
 \begin{exercise}{correlations.m}{}