improved on statistics

2017-11-24 19:34:16 +01:00 · 2017-11-24 19:34:16 +01:00 · f362788620
commit f362788620
parent 35d1b908f3
7 changed files with 144 additions and 44 deletions
--- a/debugging/lecture/debugging-chapter.pdf
+++ b/debugging/lecture/debugging-chapter.pdf
--- a/header.tex
+++ b/header.tex
@ -27,8 +27,9 @@
 \usepackage[makeindex]{splitidx}
 \makeindex
 \usepackage[totoc]{idxlayout}
-\newindex[Fachbegriffe]{term}
+\newindex[\tr{Glossary}{Fachbegriffe}]{term}
 \newindex[Englische Fachbegriffe]{enterm}
+\newindex[Deutsche Fachbegriffe]{determ}
 \newindex[MATLAB Code]{mcode}
 \newindex[Python Code]{pcode}

@ -198,8 +199,8 @@

 %%%%% english, german, code and file terms: %%%%%%%%%%%%%%%
 \usepackage{ifthen}
-\newcommand{\enterm}[2][]{``#2''\ifthenelse{\equal{#1}{}}{\protect\sindex[enterm]{#2}}{\protect\sindex[enterm]{#1}}}
-\newcommand{\determ}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
+\newcommand{\enterm}[2][]{\tr{\textit{#2}}{``#2''}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[term]{#2}}{\protect\sindex[enterm]{#2}}}{\tr{\protect\sindex[term]{#1}}{\protect\sindex[enterm]{#1}}}}
+\newcommand{\determ}[2][]{\tr{``#2''}{\textit{#2}}\ifthenelse{\equal{#1}{}}{\tr{\protect\sindex[determ]{#2}}{\protect\sindex[term]{#2}}}{\tr{\protect\sindex[determ]{#1}}{\protect\sindex[term]{#1}}}}
 \newcommand{\codeterm}[2][]{\textit{#2}\ifthenelse{\equal{#1}{}}{\protect\sindex[term]{#2}}{\protect\sindex[term]{#1}}}
 \newcommand{\file}[1]{\texttt{#1}}

--- a/scientificcomputing-script.tex
+++ b/scientificcomputing-script.tex
@ -92,7 +92,8 @@
 %%%% indices: %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \printindex[term]

-\printindex[enterm]
+\printindex[determ]    % for english text
+% \printindex[enterm]    % for german text

 %\setindexprenote{Some explanations.}
 %\printindex[pcode]
--- a/statistics/code/correlations.m
+++ b/statistics/code/correlations.m
@ -1,14 +1,18 @@
-n = 1000
-x = randn( n, 1 );
-y = randn( n, 1 ) + 0.2*x;
-r = corr(x,y)
-
-  nsamples = 500;
-  rs = zeros( nsamples, 1 );
-for i = 1:nsamples
-	  xs = x(randi(n,n,1));
-ys = x(randi(n,n,1));
-	  rs(i) = corr(xs,ys);
+n = 200;
+corrs = [ 1.0, 0.6, 0.0, -0.9 ];
+for k = [1:length(corrs)]
+    r = corrs(k);
+    x = randn(n, 1);
+    y = r*x; % linear dependence of y on x
+    % add noise to destroy perfect correlations:
+    y = y + sqrt(1.0-r*r)*randn(n, 1);
+    % compute correlation coefficient of data:
+    rho = corr(x, y);
+    subplot(2, 2, k)
+    scatter( x, y )
+    text( -2, 2.5, sprintf('r=%.1f', rho) )
+    xlabel('x')
+    ylabel('y')
+    xlim([-3.0, 3.0])
+    ylim([-3.0, 3.0])
 end
-
-hist( rs, 20 )
--- a/statistics/lecture/displayunivariatedata.py
+++ b/statistics/lecture/displayunivariatedata.py
@ -0,0 +1,54 @@
+import numpy as np
+import matplotlib.pyplot as plt
+
+rng = np.random.RandomState(981)
+x = rng.randn(40, 10) + 4.0
+
+plt.xkcd()
+fig = plt.figure( figsize=(6,3.4) )
+ax = fig.add_subplot(1, 1, 1)
+ax.spines['right'].set_visible(False)
+ax.spines['top'].set_visible(False)
+ax.yaxis.set_ticks_position('left')
+ax.xaxis.set_ticks_position('bottom')
+ax.set_xlabel('Experiment')
+ax.set_ylabel('x')
+ax.set_ylim( 0.0, 8.0)
+ax.scatter(0.5+rng.rand(len(x[:, 5])), x[:, 5], s=50)
+ax.bar([2.0], [np.mean(x[:, 5])], 1.0, yerr=[np.std(x[:, 5])],
+       ecolor='k', capsize=0, error_kw={'elinewidth':5})
+
+## ax.annotate('Median',
+##             xy=(3.9, 0.0), xycoords='data',
+##             xytext=(3.5, -2.7), textcoords='data', ha='right',
+##             arrowprops=dict(arrowstyle="->", relpos=(0.8,1.0),
+##             connectionstyle="angle3,angleA=-110,angleB=60") )
+## ax.annotate('1. quartile',
+##             xy=(5.8, -0.9), xycoords='data',
+##             xytext=(5.5, -3.4), textcoords='data', ha='right',
+##             arrowprops=dict(arrowstyle="->", relpos=(0.9,1.0),
+##             connectionstyle="angle3,angleA=30,angleB=70") )
+## ax.annotate('3. quartile',
+##             xy=(6.1, 1.1), xycoords='data',
+##             xytext=(6.5, 3.0), textcoords='data', ha='left',
+##             arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0),
+##             connectionstyle="angle3,angleA=30,angleB=70") )
+## ax.annotate('minimum',
+##             xy=(6.1, -1.9), xycoords='data',
+##             xytext=(7.2, -3.3), textcoords='data', ha='left',
+##             arrowprops=dict(arrowstyle="->", relpos=(0.0,0.5),
+##             connectionstyle="angle3,angleA=10,angleB=100") )
+## ax.annotate('maximum',
+##             xy=(5.9, 2.7), xycoords='data',
+##             xytext=(4.9, 3.5), textcoords='data', ha='right',
+##             arrowprops=dict(arrowstyle="->", relpos=(1.0,0.5),
+##             connectionstyle="angle3,angleA=0,angleB=120") )
+#ax.boxplot( x[:, 5], positions=[4.0], whis=100.0 )
+#ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0, manage_xticks=False )
+ax.boxplot( x[:, 5], positions=[4.0], widths=[1.0], whis=100.0 )
+ax.set_xlim(0.0, 5.0)
+ax.set_xticks([1, 3, 5], ['a', 'b', 'c'])
+plt.tight_layout()
+plt.savefig('displayunivariatedata.pdf')
+#plt.show()
+
--- a/statistics/lecture/nonlincorrelation.py
+++ b/statistics/lecture/nonlincorrelation.py
@ -15,7 +15,7 @@ ax.spines['top'].set_visible(False)
 ax.yaxis.set_ticks_position('left')
 ax.xaxis.set_ticks_position('bottom')
 ax.text( 0, 4.0, 'r=%.1f' % r, ha='center' )
-ax.text( 0, 5.5, r'$y = x^2+\xi/5$', ha='center' )
+ax.text( 0, 6, r'$y = x^2+\xi/5$', ha='center' )
 ax.set_xlabel('x')
 ax.set_ylabel('y')
 ax.set_xlim( -3.0, 3.0)
@ -30,7 +30,7 @@ ax.spines['top'].set_visible(False)
 ax.yaxis.set_ticks_position('left')
 ax.xaxis.set_ticks_position('bottom')
 ax.text( 0, 1.5, 'r=%.1f' % r, ha='center' )
-ax.text( 0, 2.5, r'$y = x \cdot \xi/2$', ha='center' )
+ax.text( 0, 3, r'$y = x \cdot \xi/2$', ha='center' )
 ax.set_xlabel('x')
 ax.set_ylabel('y')
 ax.set_xlim( -3.0, 3.0)
--- a/statistics/lecture/statistics.tex
+++ b/statistics/lecture/statistics.tex
@ -5,7 +5,7 @@
 Descriptive statistics characterizes data sets by means of a few measures.

 In addition to histograms that visualize the distribution of the data,
-the following measures are used for characterizing the data:
+the following measures are used for characterizing the univariate data:
 \begin{description}
 \item[Location, central tendency] (``Lagema{\ss}e''):
  arithmetic mean, median, mode.
@ -13,10 +13,39 @@ the following measures are used for characterizing the data:
  standard deviation, inter-quartile range,\linebreak coefficient of variation
  (``Variationskoeffizient'').
 \item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung'').
+\end{description}
+For bivariate and multivariate data sets we can also analyse their
+\begin{description}
 \item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient,
  Spearman's rank correlation coefficient.
 \end{description}

+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Mean, variance, and standard deviation}
+The \enterm{arithmetic mean} is a measure of location. For $n$ data values
+$x_i$ the arithmetic mean is computed by
+\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
+The mean has the same unit as the data values.
+
+The dispersion of the data values around the mean is quantified by
+their \enterm{variance}
+\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
+The unit of the variance is the unit of the data values squared.
+Therefore, variances cannot be compared to the mean or the data values
+themselves. In particular, variances cannot be used for plotting error
+bars along with the mean.
+
+The standard deviation
+\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
+however, has the same unit as the data values and can (and should) be
+used to display the dispersion of the data together with their mean.
+
+\begin{figure}[t]
+  \includegraphics[width=1\textwidth]{displayunivariatedata}
+  \titlecaption{\label{displayunivariatefig} Display univariate
+    data.}{Bla.}
+\end{figure}
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Mode, median, quartile, etc.}

@ -44,7 +73,6 @@ not smaller than the median (\figref{medianfig}).

 \matlab{} provides the function \code{median()} for computing the median.

-\newpage
 \begin{exercise}{checkmymedian.m}{}
  Write a script that tests whether your median function really
  returns a median above which are the same number of data than
@ -242,36 +270,48 @@ $\Delta x$ der Klassen geteilt werden (\figref{pdfhistogramfig}).

 \section{Correlations}

-\begin{figure}[tp]
-  \includegraphics[width=1\textwidth]{correlation}
-  \titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
-\end{figure}
+Until now we described properties of univariate data sets.  In
+bivariate or multivariate data sets where we have pairs or tuples of
+data values (e.g. the size and the weight of elephants) we want to analyze
+dependencies between the variables.

-Bisher haben wir Eigenschaften einer einzelnen Me{\ss}gr\"o{\ss}e
-angeschaut.  Bei mehreren Me{\ss}gr\"o{\ss}en, kann nach
-Abh\"angigkeiten zwischen den beiden Gr\"o{\ss}en gefragt werden.  Der
-\determ[Korrelationskoeffizient]{Korrelations\-koeffizient}
+The \enterm{correlation coefficient}
 \[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
  (x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
    (x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y
    \rangle)^2 \rangle}} \] 
-quantifiziert einfache lineare Zusammenh\"ange \matlabfun{corr()}. Der
-Korrelationskoeffizient ist die \determ{Kovarianz} normiert durch die
-Standardabweichungen.  Perfekt korrelierte Variablen ergeben einen
-Korrelationskoeffizienten von $+1$, antikorrelierte Daten einen
-Korrelationskoeffizienten von $-1$ und nicht korrelierte Daten einen
-Korrelationskoeffizienten nahe Null (\figrefb{correlationfig}).
+quantifies linear relationships between two variables
+\matlabfun{corr()}.  The correlation coefficient is the
+\determ{covariance} normalized by the standard deviations of the
+single variables.  Perfectly correlated variables result in a
+correlation coefficient of $+1$, anti-correlated or negatively
+correlated data in a correlation coefficient of $-1$ and un-correlated
+data in a correlation coefficient close to zero
+(\figrefb{correlationfig}).
+
+\begin{figure}[tp]
+  \includegraphics[width=1\textwidth]{correlation}
+  \titlecaption{\label{correlationfig} Korrelationen zwischen Datenpaaren.}{}
+\end{figure}
+
+\begin{exercise}{correlations.m}{}
+  Generate pairs of random numbers with four different correlations
+  (perfectly correlated, somewhat correlated, uncorrelated, negatively
+  correlated).  Plot them into a scatter plot and compute their
+  correlation coefficient.
+\end{exercise}

-Nichtlineare Abh\"angigkeiten werden von dem Korrelationskoeffizienten
-nur unzureichend oder \"uberhaupt nicht erfasst (\figref{nonlincorrelationfig}).
+Note that non-linear dependencies between two variables are
+insufficiently or not at all detected by the correlation coefficient
+(\figref{nonlincorrelationfig}).

 \begin{figure}[tp]
  \includegraphics[width=1\textwidth]{nonlincorrelation}
-  \titlecaption{\label{nonlincorrelationfig} Korrelationen bei
-    nichtlineare Zusammenh\"angen.}{Der Korrelationskoeffizienten
-    erfasst nur lineare Zusammenh\"ange. Sowohl die quadratische
-    Abh\"angigkeit (links) als auch eine Rauschkorrelation (rechts),
-    bei der die Streuung der $y$-Werte von $x$ abh\"angen, ergeben
-    Korrelationskeffizienten nahe Null. $\xi$ sind normalverteilte
-    Zufallszahlen.}
+  \titlecaption{\label{nonlincorrelationfig} Correlations for
+    non-linear dependencies.}{The correlation coefficient detects
+    linear dependencies only. Both the quadratic dependency (left) and
+    the noise correlation (right), where the dispersion of the
+    $y$-values depends on the $x$-value, result in correlation
+    coefficients close to zero. $\xi$ denote normally distributed
+    random numbers.}
 \end{figure}