improved on statistics

2017-11-24 19:34:16 +01:00 · 2017-11-24 19:34:16 +01:00 · f362788620
commit f362788620
parent 35d1b908f3
7 changed files with 144 additions and 44 deletions
--- a/debugging/lecture/debugging-chapter.pdf
+++ b/debugging/lecture/debugging-chapter.pdf
--- a/header.tex
+++ b/header.tex
@ -27,8 +27,9 @@
 \usepackage[makeindex]{splitidx}
 \makeindex
 \usepackage[totoc]{idxlayout}
-\newindex[Fachbegriffe]{term}
+\newindex[\tr{Glossary}{Fachbegriffe}]{term}
 \newindex[Englische Fachbegriffe]{enterm}
 \newindex[Deutsche Fachbegriffe]{determ}
 \newindex[MATLAB Code]{mcode}
 \newindex[Python Code]{pcode}
@ -198,8 +199,8 @@
 %%%%% english, german, code and file terms: %%%%%%%%%%%%%%%
 \usepackage{ifthen}
-\newcommand{\enterm}[2][]{``#2''\ifthenelse{\equal{#1}{}}{\protect\sindex[enterm]{#2}}{\protect\sindex[enterm]{#1}}}
+\newcommand{\enterm}[2][]{\tr{\textit{#2}}{``#2''}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[term]{#2}}{\protect\sindex[enterm]{#2}}}{\tr{\protect\sindex[term]{#1}}{\protect\sindex[enterm]{#1}}}}
-\newcommand{\determ}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
+\newcommand{\determ}[2][]{\tr{``#2''}{\textit{#2}}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[determ]{#2}}{\protect\sindex[term]{#2}}}{\tr{\protect\sindex[determ]{#1}}{\protect\sindex[term]{#1}}}}
 \newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
 \newcommand{\file}[1]{\texttt{#1}}
--- a/scientificcomputing-script.tex
+++ b/scientificcomputing-script.tex
@ -92,7 +92,8 @@
 %%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \printindex[term]
-\printindex[enterm]
+\printindex[determ]    % for english text
 % \printindex[enterm]    % for german text
 %\setindexprenote{Some explanations.}
 %\printindex[pcode]
--- a/statistics/code/correlations.m
+++ b/statistics/code/correlations.m
@ -1,14 +1,18 @@
-n = 1000
+n = 200;
-x = randn( n, 1 );
+corrs = [ 1.0, 0.6, 0.0, -0.9 ];
-y = randn( n, 1 ) + 0.2*x;
+for k = [1:length(corrs)]
-r = corr(x,y)
+    r = corrs(k);
-
+    x = randn(n, 1);
-  nsamples = 500;
+    y = r*x; % linear dependence of y on x
-  rs = zeros( nsamples, 1 );
+    % add noise to destroy perfect correlations:
-for i = 1:nsamples
+    y = y + sqrt(1.0-r*r)*randn(n, 1);
-	  xs = x(randi(n,n,1));
+    % compute correlation coefficient of data:
-ys = x(randi(n,n,1));
+    rho = corr(x, y);
-	  rs(i) = corr(xs,ys);
+    subplot(2, 2, k)
    scatter( x, y )
    text( -2, 2.5, sprintf('r=%.1f', rho) )
    xlabel('x')
    ylabel('y')
    xlim([-3.0, 3.0])
    ylim([-3.0, 3.0])
 end
 hist( rs, 20 )
--- a/statistics/lecture/displayunivariatedata.py
+++ b/statistics/lecture/displayunivariatedata.py
@ -0,0 +1,54 @@
 import numpy as np
 import matplotlib.pyplot as plt
 rng = np.random.RandomState(981)
 x = rng.randn(40, 10) + 4.0
 plt.xkcd()
 fig = plt.figure( figsize=(6,3.4) )
 ax = fig.add_subplot(1, 1, 1)
 ax.spines['right'].set_visible(False)
 ax.spines['top'].set_visible(False)
 ax.yaxis.set_ticks_position('left')
 ax.xaxis.set_ticks_position('bottom')
 ax.set_xlabel('Experiment')
 ax.set_ylabel('x')
 ax.set_ylim( 0.0, 8.0)
 ax.scatter(0.5+rng.rand(len(x[:, 5])), x[:, 5], s=50)
 ax.bar([2.0], [np.mean(x[:, 5])], 1.0, yerr=[np.std(x[:, 5])],
       ecolor='k', capsize=0, error_kw={'elinewidth':5})
 ## ax.annotate('Median',
 ##             xy=(3.9, 0.0), xycoords='data',
 ##             xytext=(3.5, -2.7), textcoords='data', ha='right',
 ##             arrowprops=dict(arrowstyle="->", relpos=(0.8,1.0),
 ##             connectionstyle="angle3,angleA=-110,angleB=60") )
 ## ax.annotate('1. quartile',
 ##             xy=(5.8, -0.9), xycoords='data',
 ##             xytext=(5.5, -3.4), textcoords='data', ha='right',
 ##             arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
 ##             connectionstyle="angle3,angleA=30,angleB=70") )
 ## ax.annotate('3. quartile',
 ##             xy=(6.1, 1.1), xycoords='data',
 ##             xytext=(6.5, 3.0), textcoords='data', ha='left',
 ##             arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0),
 ##             connectionstyle="angle3,angleA=30,angleB=70") )
 ## ax.annotate('minimum',
 ##             xy=(6.1, -1.9), xycoords='data',
 ##             xytext=(7.2, -3.3), textcoords='data', ha='left',
 ##             arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5),
 ##             connectionstyle="angle3,angleA=10,angleB=100") )
 ## ax.annotate('maximum',
 ##             xy=(5.9, 2.7), xycoords='data',
 ##             xytext=(4.9, 3.5), textcoords='data', ha='right',
 ##             arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
 ##             connectionstyle="angle3,angleA=0,angleB=120") )
 #ax.boxplot( x[:, 5], positions=[4.0], whis=100.0 )
 #ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0, manage_xticks=False )
 ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0 )
 ax.set_xlim(0.0, 5.0)
 ax.set_xticks([1, 3, 5], ['a', 'b', 'c'])
 plt.tight_layout()
 plt.savefig('displayunivariatedata.pdf')
 #plt.show()
--- a/statistics/lecture/nonlincorrelation.py
+++ b/statistics/lecture/nonlincorrelation.py
@ -15,7 +15,7 @@ ax.spines['top'].set_visible(False)
 ax.yaxis.set_ticks_position('left')
 ax.xaxis.set_ticks_position('bottom')
 ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' )
-ax.text( 0, 5.5, r'$y = x^2+\xi/5$', ha='center' )
+ax.text( 0, 6, r'$y = x^2+\xi/5$', ha='center' )
 ax.set_xlabel('x')
 ax.set_ylabel('y')
 ax.set_xlim( -3.0, 3.0)
@ -30,7 +30,7 @@ ax.spines['top'].set_visible(False)
 ax.yaxis.set_ticks_position('left')
 ax.xaxis.set_ticks_position('bottom')
 ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' )
-ax.text( 0, 2.5, r'$y = x \cdot \xi/2$', ha='center' )
+ax.text( 0, 3, r'$y = x \cdot \xi/2$', ha='center' )
 ax.set_xlabel('x')
 ax.set_ylabel('y')
 ax.set_xlim( -3.0, 3.0)
--- a/statistics/lecture/statistics.tex
+++ b/statistics/lecture/statistics.tex
@ -5,7 +5,7 @@
 Descriptive statistics characterizes data sets by means of a few measures.
 In addition to histograms that visualize the distribution of the data,
-the following measures are used for characterizing the data:
+the following measures are used for characterizing the univariate data:
 \begin{description}
 \item[Location, central tendency] (``Lagema{\ss}e''):
  arithmetic mean, median, mode.
@ -13,10 +13,39 @@ the following measures are used for characterizing the data:
  standard deviation, inter-quartile range,\linebreak coefficient of variation
  (``Variationskoeffizient'').
 \item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung'').
 \end{description}
 For bivariate and multivariate data sets we can also analyse their
 \begin{description}
 \item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient,
  Spearman's rank correlation coefficient.
 \end{description}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Mean, variance, and standard deviation}
 The \enterm{arithmetic mean} is a measure of location. For $n$ data values
 $x_i$ the arithmetic mean is computed by
 \[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
 The mean has the same unit as the data values.
 The dispersion of the data values around the mean is quantified by
 their \enterm{variance}
 \[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
 The unit of the variance is the unit of the data values squared.
 Therefore, variances cannot be compared to the mean or the data values
 themselves. In particular, variances cannot be used for plotting error
 bars along with the mean.
 The standard deviation
 \[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
 however, has the same unit as the data values and can (and should) be
 used to display the dispersion of the data together with their mean.
 \begin{figure}[t]
  \includegraphics[width=1\textwidth]{displayunivariatedata}
  \titlecaption{\label{displayunivariatefig} Display univariate
    data.}{Bla.}
 \end{figure}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Mode, median, quartile, etc.}
@ -44,7 +73,6 @@ not smaller than the median (\figref{medianfig}).
 \matlab{} provides the function \code{median()} for computing the median.
 \newpage
 \begin{exercise}{checkmymedian.m}{}
  Write a script that tests whether your median function really
  returns a median above which are the same number of data than
@ -242,36 +270,48 @@ $\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).
 \section{Correlations}
 Until now we described properties of univariate data sets.  In
 bivariate or multivariate data sets where we have pairs or tuples of
 data values (e.g. the size and the weight of elephants) we want to analyze
 dependencies between the variables.
 The \enterm{correlation coefficient}
 \[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
  (x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
    (x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y
    \rangle)^2 \rangle}} \]
 quantifies linear relationships between two variables
 \matlabfun{corr()}.  The correlation coefficient is the
 \determ{covariance} normalized by the standard deviations of the
 single variables.  Perfectly correlated variables result in a
 correlation coefficient of $+1$, anti-correlated or negatively
 correlated data in a correlation coefficient of $-1$ and un-correlated
 data in a correlation coefficient close to zero
 (\figrefb{correlationfig}).
 \begin{figure}[tp]
  \includegraphics[width=1\textwidth]{correlation}
  \titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
 \end{figure}
-Bisher haben wir Eigenschaften einer einzelnen Me{\ss}gr\"o{\ss}e
+\begin{exercise}{correlations.m}{}
-angeschaut.  Bei mehreren Me{\ss}gr\"o{\ss}en, kann nach
+  Generate pairs of random numbers with four different correlations
-Abh\"angigkeiten zwischen den beiden Gr\"o{\ss}en gefragt werden.  Der
+  (perfectly correlated, somewhat correlated, uncorrelated, negatively
-\determ[Korrelationskoeffizient]{Korrelations\-koeffizient}
+  correlated).  Plot them into a scatter plot and compute their
-\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
+  correlation coefficient.
-  (x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
+\end{exercise}
    (x-\langle x \rangle)^2} \rangle \sqrt{\langle (y-\langle y
    \rangle)^2} \rangle} \] 
 quantifiziert einfache lineare Zusammenh\"ange \matlabfun{corr()}. Der
 Korrelationskoeffizient ist die \determ{Kovarianz} normiert durch die
 Standardabweichungen.  Perfekt korrelierte Variablen ergeben einen
 Korrelationskoeffizienten von $+1$, antikorrelierte Daten einen
 Korrelationskoeffizienten von $-1$ und nicht korrelierte Daten einen
 Korrelationskoeffizienten nahe Null (\figrefb{correlationfig}).
-Nichtlineare Abh\"angigkeiten werden von dem Korrelationskoeffizienten
+Note that non-linear dependencies between two variables are
-nur unzureichend oder \"uberhaupt nicht erfasst (\figref{nonlincorrelationfig}).
+insufficiently or not at all detected by the correlation coefficient
 (\figref{nonlincorrelationfig}).
 \begin{figure}[tp]
  \includegraphics[width=1\textwidth]{nonlincorrelation}
-  \titlecaption{\label{nonlincorrelationfig} Korrelationen bei
+  \titlecaption{\label{nonlincorrelationfig} Correlations for
-    nichtlineare Zusammenh\"angen.}{Der Korrelationskoeffizienten
+    non-linear dependencies.}{The correlation coefficient detects
-    erfasst nur lineare Zusammenh\"ange. Sowohl die quadratische
+    linear dependencies only. Both the quadratic dependency (left) and
-    Abh\"angigkeit (links) als auch eine Rauschkorrelation (rechts),
+    the noise correlation (right), where the dispersion of the
-    bei der die Streuung der $y$-Werte von $x$ abh\"angen, ergeben
+    $y$-values depends on the $x$-value, result in correlation
-    Korrelationskeffizienten nahe Null. $\xi$ sind normalverteilte
+    coefficients close to zero. $\xi$ denote normally distributed
-    Zufallszahlen.}
+    random numbers.}
 \end{figure}