%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{Descriptive statistics}
Descriptive statistics characterizes data sets by means of a few measures.
In addition to histograms that visualize the distribution of the data,
the following measures are used for characterizing the univariate data:
\begin{description}
\item[Location, central tendency] (``Lagema{\ss}e''):
arithmetic mean, median, mode.
\item[Spread, dispersion] (``Streuungsma{\ss}e''): variance,
standard deviation, inter-quartile range,\linebreak coefficient of variation
(``Variationskoeffizient'').
\item[Shape]: skewness (``Schiefe''), kurtosis (``W\"olbung'').
\end{description}
For bivariate and multivariate data sets we can also analyse their
\begin{description}
\item[Dependence, association] (``Zusammenhangsma{\ss}e''): Pearson's correlation coefficient,
Spearman's rank correlation coefficient.
\end{description}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Mean, variance, and standard deviation}
The \enterm{arithmetic mean} is a measure of location. For $n$ data values
$x_i$ the arithmetic mean is computed by
\[ \bar x = \langle x \rangle = \frac{1}{n}\sum_{i=1}^n x_i \; . \]
The mean has the same unit as the data values.
The dispersion of the data values around the mean is quantified by
their \enterm{variance}
\[ \sigma^2_x = \langle (x-\langle x \rangle)^2 \rangle = \frac{1}{n}\sum_{i=1}^n (x_i - \bar x)^2 \; . \]
The unit of the variance is the unit of the data values squared.
Therefore, variances cannot be compared to the mean or the data values
themselves. In particular, variances cannot be used for plotting error
bars along with the mean.
The standard deviation
\[ \sigma_x = \sqrt{\sigma^2_x} \; , \]
however, has the same unit as the data values and can (and should) be
used to display the dispersion of the data together with their mean.
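In \matlab{} these measures are readily computed. Note that
\code{var()} and \code{std()} by default normalize by $n-1$; a second
argument of one selects the $1/n$ definition used above. A minimal
sketch:
\begin{lstlisting}
x = randn(100, 1)*2.0 + 5.0;  % data drawn from a distribution with mean 5 and standard deviation 2
m = mean(x)                   % arithmetic mean
v = var(x, 1)                 % variance with the 1/n normalization from above
s = std(x, 1)                 % standard deviation, the square root of v
\end{lstlisting}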
\begin{figure}[t]
\includegraphics[width=1\textwidth]{displayunivariatedata}
\titlecaption{\label{displayunivariatefig} Display univariate
data.}{Bla.}
\end{figure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Mode, median, quartile, etc.}
\begin{figure}[t]
\includegraphics[width=1\textwidth]{median}
\titlecaption{\label{medianfig} Median, mean and mode of a
probability distribution.}{Left: Median, mean and mode are
identical for the symmetric and unimodal normal distribution.
    Right: for asymmetric distributions these three measures differ. A
heavy tail of a distribution pulls out the mean most strongly. In
contrast, the median is more robust against heavy tails, but not
necessarily identical with the mode.}
\end{figure}
The \enterm{mode} is the most frequent value, i.e. the position of the maximum of the probability distribution.
The \enterm{median} separates a list of data values into two halves
such that one half of the data is not greater and the other half is
not smaller than the median (\figref{medianfig}).
\newpage
\begin{exercise}{mymedian.m}{}
Write a function \code{mymedian()} that computes the median of a vector.
\end{exercise}
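One possible sketch of such a function sorts the data and returns the
middle element, or, for an even number of elements, the mean of the
two middle elements:
\begin{lstlisting}
function m = mymedian(x)
% Returns the median of the vector x.
  xs = sort(x(:));                    % sort the data
  n = length(xs);
  if mod(n, 2) == 0                   % even number of elements:
      m = (xs(n/2) + xs(n/2+1))/2.0;  % average the two middle values
  else                                % odd number of elements:
      m = xs((n+1)/2);                % take the middle value
  end
end
\end{lstlisting}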
\matlab{} provides the function \code{median()} for computing the median.
\begin{exercise}{checkmymedian.m}{}
  Write a script that tests whether your median function really
  returns a median, i.e. whether the same number of data values are
  larger and smaller than the returned median. In particular, the
  script should test data vectors of different lengths.
\end{exercise}
\begin{figure}[t]
\includegraphics[width=1\textwidth]{quartile}
\titlecaption{\label{quartilefig} Median and quartiles of a normal distribution.}{}
\end{figure}
The distribution of data can be further characterized by the position
of its \enterm[quartile]{quartiles}. Neighboring quartiles are
separated by 25\,\% of the data (\figref{quartilefig}).
\enterm[percentile]{Percentiles} characterize the
distribution of the data in more detail. The 3$^{\rm rd}$ quartile
corresponds to the 75$^{\rm th}$ percentile, because 75\,\% of the
data are smaller than the 3$^{\rm rd}$ quartile.
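Quartiles and percentiles can be computed, for example, with the
\code{prctile()} function from the Statistics Toolbox (one can also
simply sort the data and index the sorted vector). A minimal sketch:
\begin{lstlisting}
x = randn(100, 1);          % example data
q = prctile(x, [25 50 75])  % 1st quartile, median, and 3rd quartile
\end{lstlisting}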
% \begin{definition}[quartile]
%   The quartiles Q1, Q2, and Q3 divide the data into four equally
%   sized groups that each contain a quarter of the data. The middle
%   quartile is the median.
% \end{definition}
% \begin{exercise}{quartiles.m}{}
% Write a function that computes the first, second, and third quartile of a vector.
% \end{exercise}
\begin{figure}[t]
\includegraphics[width=1\textwidth]{boxwhisker}
\titlecaption{\label{boxwhiskerfig} Box-Whisker Plot.}{Box-whisker
plots are well suited for comparing unimodal distributions. Each
box-whisker characterizes 40 random numbers that have been drawn
from a normal distribution.}
\end{figure}
\enterm{Box-whisker plots} are commonly used to visualize and compare
the distribution of unimodal data. A box is drawn around the median
that extends from the 1$^{\rm st}$ to the 3$^{\rm rd}$ quartile. The
whiskers mark the minimum and maximum value of the data set
(\figref{boxwhiskerfig}).
\begin{exercise}{boxwhisker.m}{}
  Generate a $40 \times 10$ matrix of random numbers and
  illustrate their distribution in a box-whisker plot
  (\code{boxplot()} function). How do you interpret the plot?
\end{exercise}
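A minimal sketch for this exercise, assuming the \code{boxplot()}
function from the Statistics Toolbox:
\begin{lstlisting}
x = randn(40, 10);  % 40 random numbers in each of 10 columns
boxplot(x);         % one box-whisker per column
xlabel('data set');
ylabel('value');
\end{lstlisting}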
\section{Histograms}
\enterm[Histogram]{Histograms} count the frequencies $n_i$ of
$N=\sum_{i=1}^M n_i$ measurements in each of $M$ bins $i$. The bins
usually tile the data range into intervals of equal size. Histograms
are often used to estimate the \enterm{probability distribution} of
the data values.
\begin{figure}[t]
\includegraphics[width=1\textwidth]{diehistograms}
  \titlecaption{\label{diehistogramsfig} Histograms resulting from
    rolling a die 100 or 500 times.}{Left: the absolute frequency
histogram counts the frequency of each number the die
shows. Right: When normalized by the sum of the frequency
histogram the two data sets become comparable with each other and
with the expected theoretical distribution of $P=1/6$.}
\end{figure}
For integer data values (e.g. the numbers shown by the faces of a die
or the number of action potentials occurring within a fixed time
window) a bin can be defined for each data value. The histogram is
usually normalized by the total number of measurements to make it
independent of the size of the data set
(\figref{diehistogramsfig}). Then the height of each histogram bar
equals the probability $P(x_i)$ of the data value $x_i$ in the $i$-th
bin:
\[ P(x_i) = P_i = \frac{n_i}{N} = \frac{n_i}{\sum_{j=1}^M n_j} \; . \]
\begin{exercise}{rollthedie.m}{}
Write a function that simulates rolling a die $n$ times.
\end{exercise}
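A minimal sketch of such a simulation, using \code{randi()} to draw
uniformly distributed integers:
\begin{lstlisting}
function x = rollthedie(n)
% Simulate rolling a fair die n times.
  x = randi(6, n, 1);  % n random integers from the range 1..6
end
\end{lstlisting}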
\begin{exercise}{diehistograms.m}{}
  Plot histograms of rolling the die 20, 100, and 1000 times. Use
  \code[hist()]{hist(x)}, force six bins with
  \code[hist()]{hist(x,6)}, or set sensible bins yourself. Then
  normalize the histogram.
\end{exercise}
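A possible approach, assuming the \code{rollthedie()} function from
the previous exercise, sets one bin per face of the die and
normalizes the counts by their sum:
\begin{lstlisting}
x = rollthedie(100);    % simulate 100 rolls
[n, c] = hist(x, 1:6);  % one bin centered on each of the six faces
bar(c, n/sum(n));       % normalize counts to probabilities
xlabel('face');
ylabel('probability');
\end{lstlisting}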
\section{Probability density functions}
Most measured quantities, however, are real numbers (e.g. the weight
of tigers or the length of interspike intervals). It makes no sense
to assign a probability to the occurrence of every single real
number, because the probability of measuring exactly a particular
real number, e.g. 1.23456789, is zero, since there are uncountably
many real numbers. It is more meaningful to ask for the probability
of obtaining a number from a certain range, e.g. the probability
$P(1.2<x<1.3)$ that the number $x$ takes a value between 1.2 and 1.3.
In the limit of very small ranges $\Delta x$ the probability of a
value $x$ between $x_0$ and $x_0+\Delta x$ is
\[ P(x_0<x<x_0+\Delta x) \approx p(x) \cdot \Delta x \; . \]
The quantity $p(x)$ is a so-called \enterm{probability density}
(``Wahrscheinlichkeitsdichte''). It is not a unitless probability
with values between zero and one, but can take on any positive value
and has as its unit the inverse of the unit of $x$.
\begin{figure}[t]
\includegraphics[width=1\textwidth]{pdfprobabilities}
  \titlecaption{\label{pdfprobabilitiesfig} Probabilities from a
    probability density function.}{}
\end{figure}
For arbitrary ranges the probability of the value $x$ lying between
$x_1$ and $x_2$ is given by
\[ P(x_1 < x < x_2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
Since the probability of observing any value $x$ at all must be one,
the following normalization holds:
\begin{equation}
\label{pdfnorm}
P(-\infty < x < \infty) = \int\limits_{-\infty}^{+\infty} p(x) \, dx = 1 \; .
\end{equation}
\pagebreak[2]
The function $p(x)$ that assigns a probability density to every
value $x$ is called a \enterm{probability
  density function} (``Wahrscheinlichkeitsdichtefunktion'',
\enterm[pdf|see{probability density
    function}]{pdf}, or short \enterm[density|see{probability density
    function}]{density}). The best-known
probability density function is that of the \enterm{normal distribution}
\[ p_g(x) =
   \frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
--- the Gaussian bell curve with mean $\mu$ and
standard deviation $\sigma$.
\begin{exercise}{gaussianpdf.m}{gaussianpdf.out}
\begin{enumerate}
  \item Plot the probability density of the normal distribution $p_g(x)$.
  \item For the normal distribution with zero mean and unit standard
    deviation, compute the probability of obtaining a number between
    0 and 1.
  \item Draw 1000 normally distributed random numbers and estimate
    from these numbers the probability of obtaining a number between
    zero and one.
  \item Compute $\int_{-\infty}^{+\infty} p(x) \, dx$ for the normal
    distribution.
\end{enumerate}
\end{exercise}
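A minimal sketch of the numerical parts, approximating the integral
by a sum over small steps $\Delta x$:
\begin{lstlisting}
dx = 0.01;
x = -4:dx:4;
p = exp(-0.5*x.^2)/sqrt(2.0*pi);                 % standard normal pdf
plot(x, p);
P01 = sum(p((x >= 0.0) & (x <= 1.0)))*dx         % integral from 0 to 1, about 0.34
r = randn(1000, 1);                              % 1000 normally distributed numbers
P01est = sum((r >= 0.0) & (r <= 1.0))/length(r)  % fraction between 0 and 1
\end{lstlisting}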
\begin{figure}[t]
\includegraphics[width=1\textwidth]{pdfhistogram}
  \titlecaption{\label{pdfhistogramfig} Histograms of normally
    distributed measurements with different bin widths.}{Left: the
    height of the absolute-count histogram depends on the bin
    width. Right: histograms normalized to an integral of one become
    comparable among each other and with the theoretical probability
    density function (blue).}
\end{figure}
\begin{exercise}{gaussianbins.m}{}
  Draw 100 random numbers from a Gaussian distribution and plot
  histograms of the data with different bin sizes. What do you
  observe?
\end{exercise}
\pagebreak[2]
To make histograms of real-valued measurements comparable among each
other, despite different numbers of measurements and different bin
widths, and to make them comparable with known probability density
functions, they have to be normalized to an integral of one
\eqnref{pdfnorm}. The integral (not the sum) over the histogram must
equal one, because the probability that any of the measured values
occurs must be one. The integral is the area of the histogram, which
is composed of the areas of the individual histogram bars. The bars
of the histogram have height $n_i$ and width $\Delta x$. The total
area $A$ of the histogram with its $M$ bins thus is
\[ A = \sum_{i=1}^M ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^M n_i \]
and the normalized histogram has height
\[ p(x_i) = \frac{n_i}{\Delta x \sum_{j=1}^M n_j} \; . \]
The histogram thus needs to be divided not only by the sum of the
counts but also by the bin width $\Delta x$
(\figref{pdfhistogramfig}).
\pagebreak[4]
\begin{exercise}{gaussianbinsnorm.m}{}
  Normalize the histogram of the previous exercise such that it
  becomes a probability density.
\end{exercise}
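A minimal sketch of this normalization, dividing the counts by both
their sum and the bin width:
\begin{lstlisting}
x = randn(100, 1);     % 100 Gaussian random numbers
[n, c] = hist(x, 20);  % counts n in 20 bins centered at c
dx = c(2) - c(1);      % width of the bins
p = n/(sum(n)*dx);     % normalize such that the integral is one
bar(c, p);             % plot the estimated probability density
\end{lstlisting}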
\section{Correlations}
Until now we described properties of univariate data sets. In
bivariate or multivariate data sets where we have pairs or tuples of
data values (e.g. the size and the weight of elephants) we want to analyze
dependencies between the variables.
The \enterm{correlation coefficient}
\[ r_{x,y} = \frac{Cov(x,y)}{\sigma_x \sigma_y} = \frac{\langle
  (x-\langle x \rangle)(y-\langle y \rangle) \rangle}{\sqrt{\langle
    (x-\langle x \rangle)^2 \rangle} \sqrt{\langle (y-\langle y
    \rangle)^2 \rangle}} \]
quantifies linear relationships between two variables
\matlabfun{corr()}. The correlation coefficient is the
\enterm{covariance} normalized by the standard deviations of the
individual variables. Perfectly correlated variables result in a
correlation coefficient of $+1$, anti-correlated or negatively
correlated variables in a correlation coefficient of $-1$, and
uncorrelated data in a correlation coefficient close to zero
(\figrefb{correlationfig}).
\begin{figure}[tp]
\includegraphics[width=1\textwidth]{correlation}
  \titlecaption{\label{correlationfig} Correlations between pairs of data values.}{}
\end{figure}
\begin{exercise}{correlations.m}{}
Generate pairs of random numbers with four different correlations
  (perfectly correlated, weakly correlated, uncorrelated, negatively
  correlated). Display them in scatter plots and compute their
  correlation coefficients.
\end{exercise}
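One way to generate pairs $(x,y)$ with a given correlation $a$ is to
mix a common component $x$ with independent noise; a minimal sketch,
assuming \code{corr()} from the Statistics Toolbox:
\begin{lstlisting}
n = 1000;
x = randn(n, 1);             % common component
noise = randn(n, 1);         % independent noise
as = [1.0, 0.6, 0.0, -1.0];  % target correlations
for k = 1:length(as)
    a = as(k);
    y = a*x + sqrt(1.0 - a^2)*noise;  % mixture with correlation a
    subplot(2, 2, k);
    scatter(x, y);
    title(sprintf('r = %.2f', corr(x, y)));
end
\end{lstlisting}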
Note that the correlation coefficient captures linear dependencies
only; non-linear dependencies between two variables are detected
insufficiently or not at all (\figref{nonlincorrelationfig}).
\begin{figure}[tp]
\includegraphics[width=1\textwidth]{nonlincorrelation}
\titlecaption{\label{nonlincorrelationfig} Correlations for
non-linear dependencies.}{The correlation coefficient detects
linear dependencies only. Both the quadratic dependency (left) and
    the noise correlation (right), where the dispersion of the
$y$-values depends on the $x$-value, result in correlation
coefficients close to zero. $\xi$ denote normally distributed
random numbers.}
\end{figure}