From 6c95ec7256b92e523e815364a0eca36bd9c5af6b Mon Sep 17 00:00:00 2001 From: Jan Benda Date: Sat, 25 Nov 2017 18:46:03 +0100 Subject: [PATCH] added section on cumulative densities --- bootstrap/lecture/bootstrap.tex | 1 + scientificcomputing-script.tex | 2 +- statistics/code/cumulative.m | 18 ++ statistics/code/gaussianbins.m | 12 +- statistics/code/gaussianbinsnorm.m | 6 +- statistics/code/gaussiankerneldensity.m | 42 ++++ statistics/lecture/cumulative.py | 52 +++++ statistics/lecture/diehistograms.py | 2 +- statistics/lecture/kerneldensity.py | 83 ++++++++ statistics/lecture/pdfhistogram.py | 2 +- statistics/lecture/quartile.py | 5 +- statistics/lecture/statistics-chapter.tex | 3 - statistics/lecture/statistics.tex | 227 ++++++++++++++++------ 13 files changed, 380 insertions(+), 75 deletions(-) create mode 100644 statistics/code/cumulative.m create mode 100644 statistics/code/gaussiankerneldensity.m create mode 100644 statistics/lecture/cumulative.py create mode 100644 statistics/lecture/kerneldensity.py diff --git a/bootstrap/lecture/bootstrap.tex b/bootstrap/lecture/bootstrap.tex index 30fd834..deeb0de 100644 --- a/bootstrap/lecture/bootstrap.tex +++ b/bootstrap/lecture/bootstrap.tex @@ -1,6 +1,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \chapter{\tr{Bootstrap Methods}{Bootstrap Methoden}} +\label{bootstrapchapter} Beim \determ{Bootstrap} erzeugt man sich die Verteilung von Statistiken durch Resampling aus der Stichprobe. Das hat mehrere Vorteile: diff --git a/scientificcomputing-script.tex b/scientificcomputing-script.tex index 4b7edde..325dbe8 100644 --- a/scientificcomputing-script.tex +++ b/scientificcomputing-script.tex @@ -46,7 +46,7 @@ \include{programmingstyle/lecture/programmingstyle} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\part{Grundlagen der Datenanalyse} +\part{Data analysis} \graphicspath{{statistics/lecture/}{statistics/lecture/figures/}} \lstset{inputpath=statistics/code} diff --git a/statistics/code/cumulative.m b/statistics/code/cumulative.m new file mode 100644 index 0000000..44ebbc5 --- /dev/null +++ b/statistics/code/cumulative.m @@ -0,0 +1,18 @@ +x = randn(200, 1); % generate some data +xs = sort(x); % sort the data +cdf = [1:length(x)]/length(x); % cumulative +plot(xs, cdf); +hold on; + +dx = 0.01; +xx = [-4:dx:4]; % x-values for Gaussian pdf +gauss = exp(-0.5*xx.^2)/sqrt(2.0*pi); % Gaussian pdf +gausscdf = cumsum(gauss)*dx; +plot(xx, gausscdf); +hold off; + +printf('data : probability of x<-1: %.2f\n', cdf(xs<-1.0)(end)) +printf('gauss: probability of x<-1: %.2f\n', gausscdf(xx<-1.0)(end)) +printf('\n') +printf('data : 5%% percentile at %.2f\n', xs(cdf<0.05)(end)) +printf('gauss: 5%% percentile at %.2f\n', xx(gausscdf<0.05)(end)) diff --git a/statistics/code/gaussianbins.m b/statistics/code/gaussianbins.m index 9e8c13e..3fe9987 100644 --- a/statistics/code/gaussianbins.m +++ b/statistics/code/gaussianbins.m @@ -1,11 +1,11 @@ x = randn(100, 1); % generate some data +db1 = 2; % large bin width +db2 = 0.5; % small bin width +bins1 = -4:db1:4; % large bins +bins2 = -4:db2:4; % small bins +[h1,b1] = hist(x, bins1); +[h2,b2] = hist(x, bins2); -bins1 = -4:2:4; % large bins -bins2 = -4:0.5:4; % small bins -[h1,b1] = hist(x,bins1); -[h2,b2] = hist(x,bins2); - -subplot( 1, 2, 1 ); bar(b1, h1) hold on bar(b2, h2, 'facecolor', 'r' ) diff --git a/statistics/code/gaussianbinsnorm.m b/statistics/code/gaussianbinsnorm.m index 3341bd0..8cdced2 100644 --- a/statistics/code/gaussianbinsnorm.m +++ b/statistics/code/gaussianbinsnorm.m @@ -1,9 +1,9 @@ hn1 = h1/sum(h1)/db1; hn2 = h2/sum(h2)/db2; -subplot( 1, 2, 2 ) -bar(b1,hn1) + +bar(b1, hn1) hold on -bar(b2,hn2, 'facecolor', 'r' ) +bar(b2, hn2, 'facecolor', 'r' ) xlabel('x') ylabel('Probability density') hold off diff --git a/statistics/code/gaussiankerneldensity.m b/statistics/code/gaussiankerneldensity.m new file mode 100644 index 0000000..4cb7925 --- /dev/null +++ b/statistics/code/gaussiankerneldensity.m @@ -0,0 +1,42 @@ +data = randn(100, 1); % generate some data +sigma = 0.2; % standard deviation of Gaussian kernel +xmin = -4.0; % minimum x value for kernel density +xmax = 4.0; % maximum x value for kernel density +dx = 0.05*sigma; % step size for kernel density +xg = [-4.0*sigma:dx:4.0*sigma]; % x-axis for single Gaussian kernel +% single Gaussian kernel: +kernel = exp(-0.5*(xg/sigma).^2)/sqrt(2.0*pi)/sigma; +ng = (length(kernel)-1)/2; % half the length of the Gaussian +x = [xmin:dx:xmax+0.5*dx]; % x-axis for kernel density +kd = zeros(1, length(x)); % vector for kernel density +for i = 1:length(data) % for every data value ... + xd = data(i); + % index of data value in kernel density vector: + inx = round((xd-xmin)/dx)+1; + % start index for Gaussian in kernel density vector: + k0 = inx-ng; + % end index for Gaussian in kernel density vector: + k1 = inx+ng; + g0 = 1; % start index in Gaussian + g1 = length(kernel); % end index in Gaussian + % check whether left side of Gaussian extends below xmin: + if inx < ng+1 + % adjust start indices accordingly: + k0 = 1; + g0 = ng-inx+1; + end + % check whether right side of Gaussian extends above xmax: + if inx > length(kd)-ng + % adjust end indices accordingly: + k1 = length(kd); + g1 = length(kernel)-(inx+ng-length(kd)); + end + % add Gaussian on kernel density: + kd(k0:k1) = kd(k0:k1) + kernel(g0:g1); +end +kd /= length(data); % normalize by number of data points + +% plot kernel density: +plot(x, kd) +xlabel('x') +ylabel('Probability density') diff --git a/statistics/lecture/cumulative.py b/statistics/lecture/cumulative.py new file mode 100644 index 0000000..7c74035 --- /dev/null +++ b/statistics/lecture/cumulative.py @@ -0,0 +1,52 @@ +import numpy as np +import matplotlib.pyplot as plt + +# data: +rng = np.random.RandomState(981) +data = rng.randn(100) +xs = np.sort(data) +cdf = np.arange(len(xs))/float(len(xs)) + +# Gauss: +dx = 0.01 +xx = np.arange(-4.0, 4.0, dx) +gauss = np.exp(-0.5*xx*xx)/np.sqrt(2.0*np.pi) +gausscdf = np.cumsum(gauss)*dx + +# plot: +plt.xkcd() +fig = plt.figure( figsize=(6, 2.6) ) +ax = fig.add_subplot(1, 1, 1) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_xlim(-3.2, 3.2) +ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) ) +ax.set_ylabel( 'F(x)' ) +ax.set_ylim(-0.05, 1.05) +ax.set_yticks( np.arange( 0.0, 1.1, 0.2 ) ) + +med = xs[cdf>=0.5][0] +ax.plot([-3.2, med, med], [0.5, 0.5, 0.0], 'k', lw=1, zorder=-5) +ax.text(-2.8, 0.55, 'F=0.5') +ax.text(0.15, 0.25, 'median at %.2f' % med) + +q3 = xs[cdf>=0.75][0] +ax.plot([-3.2, q3, q3], [0.75, 0.75, 0.0], 'k', lw=1, zorder=-5) +ax.text(-2.8, 0.8, 'F=0.75') +ax.text(0.8, 0.5, '3. quartile at %.2f' % q3) + +p = cdf[xs>=-1.0][0] +ax.plot([-3.2, -1.0, -1.0], [p, p, 0.0], 'k', lw=1, zorder=-5) +ax.text(-2.8, 0.2, 'F=%.2f' % p) +ax.text(-0.9, 0.05, '-1') + +ax.plot(xx, gausscdf, '-', color='#0000ff', lw=2, zorder=-1) +ax.plot(xs, cdf, '-', color='#cc0000', lw=4, zorder=-1) +ax.plot([-3.2, 3.2], [1.0, 1.0], '--', color='k', lw=2, zorder=-10) + +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3) +fig.savefig( 'cumulative.pdf' ) +#plt.show() diff --git a/statistics/lecture/diehistograms.py b/statistics/lecture/diehistograms.py index ad53c38..622d39b 100644 --- a/statistics/lecture/diehistograms.py +++ b/statistics/lecture/diehistograms.py @@ -34,6 +34,6 @@ ax.set_ylim(0, 0.23) ax.set_ylabel( 'Probability' ) ax.plot([0.2, 6.8], [1.0/6.0, 1.0/6.0], '-b', lw=2, zorder=1) ax.hist([x2, x1], bins, normed=True, color=['#FFCC00', '#FFFF66' ], zorder=10) -plt.tight_layout() +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0) fig.savefig( 'diehistograms.pdf' ) #plt.show() diff --git a/statistics/lecture/kerneldensity.py b/statistics/lecture/kerneldensity.py new file mode 100644 index 0000000..6f93820 --- /dev/null +++ b/statistics/lecture/kerneldensity.py @@ -0,0 +1,83 @@ +import numpy as np +import matplotlib.pyplot as plt + +# normal distribution: +rng = np.random.RandomState(6281) +x = np.arange( -4.0, 4.0, 0.01 ) +g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi) +r = rng.randn(100) + +def kerneldensity(data, xmin, xmax, sigma=1.0) : + dx = 0.05*sigma + xg = np.arange(-4.0*sigma, 4.0*sigma + 0.5*dx, dx) + gauss = np.exp(-0.5*xg*xg/sigma/sigma)/np.sqrt(2.0*np.pi)/sigma + ng = len(gauss)/2 + x = np.arange(xmin, xmax+0.5*dx, dx) + kd = np.zeros(len(x)) + for xd in data: + inx = int((xd-xmin)/dx) + k0 = inx-ng + k1 = inx+ng+1 + g0 = 0 + g1 = len(gauss) + if inx < ng: + k0 = 0 + g0 = ng-inx + if inx >= len(kd)-ng: + k1 = len(kd) + g1 = len(gauss)-(inx+ng-len(kd)+1) + kd[k0:k1] += gauss[g0:g1] + kd /= len(data) + return kd, x + + +plt.xkcd() + +fig = plt.figure( figsize=(6,3) ) +ax = fig.add_subplot(2, 2, 1) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_xlim(-3.2, 3.2) +ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) ) +ax.set_ylabel( 'p(x)' ) +ax.set_ylim(0.0, 0.49) +ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) ) +#ax.plot(x, g, '-b', lw=2, zorder=-1) +ax.hist(r, np.arange(-4.1, 4, 0.4), normed=True, color='#FFCC00', zorder=-5) + +ax = fig.add_subplot(2, 2, 3) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_xlim(-3.2, 3.2) +ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) ) +ax.set_ylabel( 'p(x)' ) +ax.set_ylim(0.0, 0.49) +ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) ) +#ax.plot(x, g, '-b', lw=2, zorder=-1) +ax.hist(r, np.arange(-4.3, 4, 0.4), normed=True, color='#FFCC00', zorder=-5) + +ax = fig.add_subplot(1, 2, 2) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel( 'x' ) +ax.set_xlim(-3.2, 3.2) +ax.set_xticks( np.arange( -3.0, 3.1, 1.0 ) ) +ax.set_ylabel( 'Probab. density p(x)' ) +ax.set_ylim(0.0, 0.49) +ax.set_yticks( np.arange( 0.0, 0.41, 0.1 ) ) +kd, xx = kerneldensity(r, -3.2, 3.2, 0.2) +ax.fill_between(xx, 0.0, kd, color='#FF9900', zorder=-5) +ax.plot(xx, kd, '-', lw=3, color='#CC0000', zorder=-1) + +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.35, hspace=0.3) +fig.savefig( 'kerneldensity.pdf' ) +#plt.show() + diff --git a/statistics/lecture/pdfhistogram.py b/statistics/lecture/pdfhistogram.py index 4855217..e35606c 100644 --- a/statistics/lecture/pdfhistogram.py +++ b/statistics/lecture/pdfhistogram.py @@ -38,7 +38,7 @@ ax.plot(x, g, '-b', lw=2, zorder=-1) ax.hist(r, 5, normed=True, color='#CC0000', zorder=-10) ax.hist(r, 20, normed=True, color='#FFCC00', zorder=-5) -plt.tight_layout() +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0) fig.savefig( 'pdfhistogram.pdf' ) #plt.show() diff --git a/statistics/lecture/quartile.py b/statistics/lecture/quartile.py index f67133f..5d02797 100644 --- a/statistics/lecture/quartile.py +++ b/statistics/lecture/quartile.py @@ -7,7 +7,7 @@ g = np.exp(-0.5*x*x)/np.sqrt(2.0*np.pi) q = [ -0.67488, 0.0, 0.67488 ] plt.xkcd() -fig = plt.figure( figsize=(6,3.4) ) +fig = plt.figure( figsize=(6,3.2) ) ax = fig.add_subplot( 1, 1, 1 ) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) @@ -44,6 +44,7 @@ ax.plot(x,g, 'b', lw=4) ax.plot([0.0, 0.0], [0.0, 0.45], 'k', lw=2 ) ax.plot([q[0], q[0]], [0.0, 0.4], 'k', lw=2 ) ax.plot([q[2], q[2]], [0.0, 0.4], 'k', lw=2 ) -plt.tight_layout() +plt.subplots_adjust(left=0.1, right=0.98, bottom=0.15, top=0.98, wspace=0.4, hspace=0.0) +#plt.tight_layout() fig.savefig( 'quartile.pdf' ) #plt.show() diff --git a/statistics/lecture/statistics-chapter.tex b/statistics/lecture/statistics-chapter.tex index e90cfe1..8d97b6c 100644 --- a/statistics/lecture/statistics-chapter.tex +++ b/statistics/lecture/statistics-chapter.tex @@ -19,9 +19,6 @@ \section{TODO} \begin{itemize} \item Replace exercise 1.3 (boxwhisker) by one recreating figure 1. -\item Proper introduction to probabilities and densities first! -\item Cumulative propability -\item Kernel Histogramms (important for convolved PSTH)! \end{itemize} \end{document} diff --git a/statistics/lecture/statistics.tex b/statistics/lecture/statistics.tex index ca86b2e..1cdfeff 100644 --- a/statistics/lecture/statistics.tex +++ b/statistics/lecture/statistics.tex @@ -80,12 +80,12 @@ used to illustrate the standard deviation of the data \begin{figure}[t] \includegraphics[width=1\textwidth]{median} \titlecaption{\label{medianfig} Median, mean and mode of a - probability distribution.}{Left: Median, mean and mode are - identical for the symmetric and unimodal normal distribution. - Right: for asymmetric distributions these three measures differ. A - heavy tail of a distribution pulls out the mean most strongly. In - contrast, the median is more robust against heavy tails, but not - necessarily identical with the mode.} + probability distribution.}{Left: Median, mean and mode coincide + for the symmetric and unimodal normal distribution. Right: for + asymmetric distributions these three measures differ. A heavy tail + of a distribution pulls out the mean most strongly. In contrast, + the median is more robust against heavy tails, but not necessarily + identical with the mode.} \end{figure} The \enterm{mode} is the most frequent value, i.e. the position of the maximum of the probability distribution. @@ -113,7 +113,10 @@ not smaller than the median (\figref{medianfig}). \begin{figure}[t] \includegraphics[width=1\textwidth]{quartile} - \titlecaption{\label{quartilefig} Median and quartiles of a normal distribution.}{} + \titlecaption{\label{quartilefig} Median and quartiles of a normal + distribution.}{ The interquartile range between the first and the + third quartile contains 50\,\% of the data and contains the + median.} \end{figure} The distribution of data can be further characterized by the position @@ -164,7 +167,9 @@ The distribution of values in a data set is estimated by histograms $N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$ (\figref{diehistogramsfig} left). The bins tile the data range usually into intervals of the same size. The width of the bins is -called the bin width. +called the bin width. The frequencies $n_i$ plotted against the +categories $i$ is the \enterm{histogram}, or the \enterm{frequency + histogram}. \begin{figure}[t] \includegraphics[width=1\textwidth]{diehistograms} @@ -219,7 +224,7 @@ category $i$, i.e. of getting a data value in the $i$-th bin. \subsection{Probability densities functions} In cases where we deal with data sets of measurements of a real -quantity (e.g. the length of snakes, the weight of elephants, the time +quantity (e.g. lengths of snakes, weights of elephants, times between succeeding spikes) there is no natural bin width for computing a histogram. In addition, the probability of measuring a data value that equals exactly a specific real number like, e.g., 0.123456789 is zero, because @@ -230,7 +235,7 @@ range. For example, we can ask for the probability $P(1.2