diff --git a/debugging/lecture/debugging-chapter.pdf b/debugging/lecture/debugging-chapter.pdf index 5faa385..d0729e3 100644 Binary files a/debugging/lecture/debugging-chapter.pdf and b/debugging/lecture/debugging-chapter.pdf differ diff --git a/header.tex b/header.tex index 34627cc..4231e0c 100644 --- a/header.tex +++ b/header.tex @@ -27,8 +27,9 @@ \usepackage[makeindex]{splitidx} \makeindex \usepackage[totoc]{idxlayout} -\newindex[Fachbegriffe]{term} +\newindex[\tr{Glossary}{Fachbegriffe}]{term} \newindex[Englische Fachbegriffe]{enterm} +\newindex[Deutsche Fachbegriffe]{determ} \newindex[MATLAB Code]{mcode} \newindex[Python Code]{pcode} @@ -198,8 +199,8 @@ %%%%% english, german, code and file terms: %%%%%%%%%%%%%%% \usepackage{ifthen} -\newcommand{\enterm}[2][]{``#2''\ifthenelse{\equal{#1}{}}{\protect\sindex[enterm]{#2}}{\protect\sindex[enterm]{#1}}} -\newcommand{\determ}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}} +\newcommand{\enterm}[2][]{\tr{\textit{#2}}{``#2''}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[term]{#2}}{\protect\sindex[enterm]{#2}}}{\tr{\protect\sindex[term]{#1}}{\protect\sindex[enterm]{#1}}}} +\newcommand{\determ}[2][]{\tr{``#2''}{\textit{#2}}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[determ]{#2}}{\protect\sindex[term]{#2}}}{\tr{\protect\sindex[determ]{#1}}{\protect\sindex[term]{#1}}}} \newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}} \newcommand{\file}[1]{\texttt{#1}} diff --git a/scientificcomputing-script.tex b/scientificcomputing-script.tex index 705a410..4b7edde 100644 --- a/scientificcomputing-script.tex +++ b/scientificcomputing-script.tex @@ -92,7 +92,8 @@ %%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \printindex[term] -\printindex[enterm] +\printindex[determ] % for english text +% \printindex[enterm] % for german text %\setindexprenote{Some explanations.} %\printindex[pcode] diff 
--git a/statistics/code/correlations.m b/statistics/code/correlations.m index 7816457..d328592 100644 --- a/statistics/code/correlations.m +++ b/statistics/code/correlations.m @@ -1,14 +1,18 @@ -n = 1000 -x = randn( n, 1 ); -y = randn( n, 1 ) + 0.2*x; -r = corr(x,y) - - nsamples = 500; - rs = zeros( nsamples, 1 ); -for i = 1:nsamples - xs = x(randi(n,n,1)); -ys = x(randi(n,n,1)); - rs(i) = corr(xs,ys); +n = 200; +corrs = [ 1.0, 0.6, 0.0, -0.9 ]; +for k = [1:length(corrs)] + r = corrs(k); + x = randn(n, 1); + y = r*x; % linear dependence of y on x + % add noise to destroy perfect correlations: + y = y + sqrt(1.0-r*r)*randn(n, 1); + % compute correlation coefficient of data: + rho = corr(x, y); + subplot(2, 2, k) + scatter( x, y ) + text( -2, 2.5, sprintf('r=%.1f', rho) ) + xlabel('x') + ylabel('y') + xlim([-3.0, 3.0]) + ylim([-3.0, 3.0]) end - -hist( rs, 20 ) diff --git a/statistics/lecture/displayunivariatedata.py b/statistics/lecture/displayunivariatedata.py new file mode 100644 index 0000000..ac62b88 --- /dev/null +++ b/statistics/lecture/displayunivariatedata.py @@ -0,0 +1,54 @@ +import numpy as np +import matplotlib.pyplot as plt + +rng = np.random.RandomState(981) +x = rng.randn(40, 10) + 4.0 + +plt.xkcd() +fig = plt.figure( figsize=(6,3.4) ) +ax = fig.add_subplot(1, 1, 1) +ax.spines['right'].set_visible(False) +ax.spines['top'].set_visible(False) +ax.yaxis.set_ticks_position('left') +ax.xaxis.set_ticks_position('bottom') +ax.set_xlabel('Experiment') +ax.set_ylabel('x') +ax.set_ylim( 0.0, 8.0) +ax.scatter(0.5+rng.rand(len(x[:, 5])), x[:, 5], s=50) +ax.bar([2.0], [np.mean(x[:, 5])], 1.0, yerr=[np.std(x[:, 5])], + ecolor='k', capsize=0, error_kw={'elinewidth':5}) + +## ax.annotate('Median', +## xy=(3.9, 0.0), xycoords='data', +## xytext=(3.5, -2.7), textcoords='data', ha='right', +## arrowprops=dict(arrowstyle="->", relpos=(0.8,1.0), +## connectionstyle="angle3,angleA=-110,angleB=60") ) +## ax.annotate('1. 
quartile', +## xy=(5.8, -0.9), xycoords='data', +## xytext=(5.5, -3.4), textcoords='data', ha='right', +## arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0), +## connectionstyle="angle3,angleA=30,angleB=70") ) +## ax.annotate('3. quartile', +## xy=(6.1, 1.1), xycoords='data', +## xytext=(6.5, 3.0), textcoords='data', ha='left', +## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0), +## connectionstyle="angle3,angleA=30,angleB=70") ) +## ax.annotate('minimum', +## xy=(6.1, -1.9), xycoords='data', +## xytext=(7.2, -3.3), textcoords='data', ha='left', +## arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5), +## connectionstyle="angle3,angleA=10,angleB=100") ) +## ax.annotate('maximum', +## xy=(5.9, 2.7), xycoords='data', +## xytext=(4.9, 3.5), textcoords='data', ha='right', +## arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5), +## connectionstyle="angle3,angleA=0,angleB=120") ) +#ax.boxplot( x[:, 5], positions=[4.0], whis=100.0 ) +#ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0, manage_xticks=False ) +ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0 ) +ax.set_xlim(0.0, 5.0) +ax.set_xticks([1, 3, 5], ['a', 'b', 'c']) +plt.tight_layout() +plt.savefig('displayunivariatedata.pdf') +#plt.show() + diff --git a/statistics/lecture/nonlincorrelation.py b/statistics/lecture/nonlincorrelation.py index 5b81e8d..e346826 100644 --- a/statistics/lecture/nonlincorrelation.py +++ b/statistics/lecture/nonlincorrelation.py @@ -15,7 +15,7 @@ ax.spines['top'].set_visible(False) ax.yaxis.set_ticks_position('left') ax.xaxis.set_ticks_position('bottom') ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' ) -ax.text( 0, 5.5, r'$y = x^2+\xi/5$', ha='center' ) +ax.text( 0, 6, r'$y = x^2+\xi/5$', ha='center' ) ax.set_xlabel('x') ax.set_ylabel('y') ax.set_xlim( -3.0, 3.0) @@ -30,7 +30,7 @@ ax.spines['top'].set_visible(False) ax.yaxis.set_ticks_position('left') ax.xaxis.set_ticks_position('bottom') ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' ) -ax.text( 0, 2.5, r'$y = x \cdot 
\xi/2$', ha='center' ) +ax.text( 0, 3, r'$y = x \cdot \xi/2$', ha='center' ) ax.set_xlabel('x') ax.set_ylabel('y') ax.set_xlim( -3.0, 3.0) diff --git a/statistics/lecture/statistics.tex b/statistics/lecture/statistics.tex index 881903c..8019fc9 100644 --- a/statistics/lecture/statistics.tex +++ b/statistics/lecture/statistics.tex @@ -5,7 +5,7 @@ Descriptive statistics characterizes data sets by means of a few measures. In addition to histograms that visualize the distribution of the data, -the following measures are used for characterizing the data: +the following measures are used for characterizing the univariate data: \begin{description} \item[Location, central tendency] (``Lagema{\ss}e''): arithmetic mean, median, mode. @@ -13,10 +13,39 @@ the following measures are used for characterizing the data: standard deviation, inter-quartile range,\linebreak coefficient of variation (``Variationskoeffizient''). \item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung''). +\end{description} +For bivariate and multivariate data sets we can also analyse their +\begin{description} \item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient, Spearman's rank correlation coefficient. \end{description} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Mean, variance, and standard deviation} +The \enterm{arithmetic mean} is a measure of location. For $n$ data values +$x_i$ the arithmetic mean is computed by +\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \] +The mean has the same unit as the data values. + +The dispersion of the data values around the mean is quantified by +their \enterm{variance} +\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \] +The unit of the variance is the unit of the data values squared. +Therefore, variances cannot be compared to the mean or the data values +themselves.
In particular, variances cannot be used for plotting error +bars along with the mean. + +The standard deviation +\[ \sigma_x = \sqrt{\sigma^2_x} \; , \] +however, has the same unit as the data values and can (and should) be +used to display the dispersion of the data together with their mean. + +\begin{figure}[t] + \includegraphics[width=1\textwidth]{displayunivariatedata} + \titlecaption{\label{displayunivariatefig} Display univariate + data.}{Bla.} +\end{figure} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Mode, median, quartile, etc.} @@ -44,7 +73,6 @@ not smaller than the median (\figref{medianfig}). \matlab{} provides the function \code{median()} for computing the median. -\newpage \begin{exercise}{checkmymedian.m}{} Write a script that tests whether your median function really returns a median above which are the same number of data than @@ -242,36 +270,48 @@ $\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}). \section{Correlations} -\begin{figure}[tp] - \includegraphics[width=1\textwidth]{correlation} - \titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{} -\end{figure} +Until now we described properties of univariate data sets. In +bivariate or multivariate data sets where we have pairs or tuples of +data values (e.g. the size and the weight of elephants) we want to analyze +dependencies between the variables. -Bisher haben wir Eigenschaften einer einzelnen Me{\ss}gr\"o{\ss}e -angeschaut. Bei mehreren Me{\ss}gr\"o{\ss}en, kann nach -Abh\"angigkeiten zwischen den beiden Gr\"o{\ss}en gefragt werden. Der -\determ[Korrelationskoeffizient]{Korrelations\-koeffizient} +The \enterm{correlation coefficient} \[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle (x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle (x-\langle x \rangle)^2} \rangle \sqrt{\langle (y-\langle y \rangle)^2} \rangle} \] -quantifiziert einfache lineare Zusammenh\"ange \matlabfun{corr()}.
Der -Korrelationskoeffizient ist die \determ{Kovarianz} normiert durch die -Standardabweichungen. Perfekt korrelierte Variablen ergeben einen -Korrelationskoeffizienten von $+1$, antikorrelierte Daten einen -Korrelationskoeffizienten von $-1$ und nicht korrelierte Daten einen -Korrelationskoeffizienten nahe Null (\figrefb{correlationfig}). +quantifies linear relationships between two variables +\matlabfun{corr()}. The correlation coefficient is the +\enterm{covariance} normalized by the standard deviations of the +single variables. Perfectly correlated variables result in a +correlation coefficient of $+1$, anti-correlated or negatively +correlated data in a correlation coefficient of $-1$ and uncorrelated +data in a correlation coefficient close to zero +(\figrefb{correlationfig}). + +\begin{figure}[tp] + \includegraphics[width=1\textwidth]{correlation} + \titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{} +\end{figure} + +\begin{exercise}{correlations.m}{} + Generate pairs of random numbers with four different correlations + (perfectly correlated, somewhat correlated, uncorrelated, negatively + correlated). Plot them into a scatter plot and compute their + correlation coefficient. +\end{exercise} -Nichtlineare Abh\"angigkeiten werden von dem Korrelationskoeffizienten -nur unzureichend oder \"uberhaupt nicht erfasst (\figref{nonlincorrelationfig}). +Note that non-linear dependencies between two variables are +insufficiently or not at all detected by the correlation coefficient +(\figref{nonlincorrelationfig}). \begin{figure}[tp] \includegraphics[width=1\textwidth]{nonlincorrelation} - \titlecaption{\label{nonlincorrelationfig} Korrelationen bei - nichtlineare Zusammenh\"angen.}{Der Korrelationskoeffizienten - erfasst nur lineare Zusammenh\"ange. Sowohl die quadratische - Abh\"angigkeit (links) als auch eine Rauschkorrelation (rechts), - bei der die Streuung der $y$-Werte von $x$ abh\"angen, ergeben - Korrelationskeffizienten nahe Null.
$\xi$ sind normalverteilte - Zufallszahlen.} + \titlecaption{\label{nonlincorrelationfig} Correlations for + non-linear dependencies.}{The correlation coefficient detects + linear dependencies only. Both the quadratic dependency (left) and + the noise correlation (right), where the dispersion of the + $y$-values depends on the $x$-value, result in correlation + coefficients close to zero. $\xi$ denotes normally distributed + random numbers.} \end{figure}