function \mcode{median()} computes the median.
writing reliable code!
\end{exercise}

\begin{figure}[t]
  \includegraphics[width=1\textwidth]{quartile}
  \titlecaption{\label{quartilefig} Median and quartiles of a normal
    distribution.}{The interquartile range between the first and the
    third quartile contains 50\,\% of the data and includes the
    median.}
\end{figure}

The distribution of data can be further characterized by the position
of its \entermde[quartile]{Quartil}{quartiles}. Neighboring quartiles are
separated by 25\,\% of the data (\figref{quartilefig}).
\entermde[percentile]{Perzentil}{Percentiles} allow one to characterize the
distribution of the data in more detail. The 3$^{\rm rd}$ quartile
corresponds to the 75$^{\rm th}$ percentile, because 75\,\% of the
median that extends from the 1$^{\rm st}$ to the 3$^{\rm rd}$
quartile. The whiskers mark the minimum and maximum value of the data
set (\figref{displayunivariatedatafig} (3)).

\begin{exercise}{univariatedata.m}{}
  Generate 40 normally distributed random numbers with a mean of 2 and
  illustrate their distribution in a box-whisker plot
  (\code{boxplot()} function), with a bar and an errorbar illustrating
  the mean and standard deviation (\code{bar()}, \code{errorbar()}),
  and the data themselves jittered randomly (as in
  \figref{displayunivariatedatafig}). How do you interpret the
  different plots?
\end{exercise}
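The summary statistics behind these plots can be computed directly. A
minimal sketch in Python/NumPy (illustrative only; the exercise itself
is meant to be solved in MATLAB, and the random seed is an arbitrary
choice):

```python
import numpy as np

rng = np.random.default_rng(1)
# 40 normally distributed values with mean 2 and standard deviation 1:
x = rng.normal(loc=2.0, scale=1.0, size=40)

# Quartiles are the 25th, 50th, and 75th percentiles:
q1, median, q3 = np.percentile(x, [25, 50, 75])

# Mean and standard deviation, as shown by the bar and errorbar:
mean = np.mean(x)
std = np.std(x)

# The interquartile range spans the central 50% of the data:
inside = np.sum((x >= q1) & (x <= q3)) / len(x)
print(q1, median, q3, mean, std, inside)
```

About half of the data points always fall between the first and third
quartile, regardless of the shape of the distribution.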
% \begin{exercise}{boxwhisker.m}{}
%   Generate a $40 \times 10$ matrix of random numbers and

Histograms are often used to estimate the
\enterm[probability!distribution]{probability distribution}
(\determ[Wahrscheinlichkeits!-verteilung]{Wahrscheinlichkeitsverteilung}) of the data values.

\subsection{Probabilities}

In the frequentist interpretation of probability, the
\enterm{probability} (\determ{Wahrscheinlichkeit}) of an event

real number like, e.g., 0.123456789 is zero, because there are
uncountably many real numbers.

We can only ask for the probability to get a measurement value in some
range. For example, we can ask for the probability $P(0<x<1)$ to
get a measurement between 0 and 1 (\figref{pdfprobabilitiesfig}). More
generally, we want to know the probability $P(x_0<x<x_1)$ to obtain a
measurement between $x_0$ and $x_1$. If we define the width of the
inverse of the unit of the data values --- hence the name ``density''.
\end{figure}

|
|
|
|
|
The probability to get a value $x$ between $x_1$ and $x_2$ is
|
|
|
|
|
given by the integral of the probability density:
|
|
|
|
|
given by an integral over the probability density:
|
|
|
|
|
\[ P(x_1 < x < x2) = \int\limits_{x_1}^{x_2} p(x) \, dx \; . \]
|
|
|
|
|
Because the probability to get any value $x$ is one, the integral of
|
|
|
|
|
the probability density over the whole real axis must be one:
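Both relations, the probability as an area under $p(x)$ and the
normalization to one, can be checked numerically. A sketch in
Python/NumPy for the standard normal distribution (illustrative; the
grid range and resolution are arbitrary choices):

```python
import numpy as np

# Standard normal probability density on a fine grid:
x = np.linspace(-8.0, 8.0, 160001)
dx = x[1] - x[0]
p = np.exp(-0.5 * x**2) / np.sqrt(2.0 * np.pi)

# P(0 < x < 1): integrate the density from 0 to 1.
sel = (x >= 0.0) & (x <= 1.0)
P = np.sum(p[sel]) * dx

# Normalization: the density integrates to one over the whole axis.
total = np.sum(p) * dx
print(P, total)
```

For the standard normal distribution about 34\,\% of the measurements
fall between 0 and 1.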

values fall within each bin (\figref{pdfhistogramfig} left).

observe?
\end{exercise}

To turn such histograms into estimates of probability densities they
need to be normalized such that, according to \eqnref{pdfnorm}, their
integral equals one. While histograms of categorical data are
normalized such that their sum equals one, here we need to integrate
and the
\[ p(x_i) = \frac{n_i}{A} = \frac{n_i}{\Delta x \sum_j n_j} =
   \frac{n_i}{N \Delta x} \; .\]
A histogram needs to be divided by both the sum of the frequencies
$n_i$ and the bin width $\Delta x$ to result in an estimate of the
corresponding probability density. Only then can the distribution be
compared with other distributions and in particular with theoretical
probability density functions like the one of the normal distribution
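This normalization can be verified numerically. A sketch in
Python/NumPy (illustrative; sample size, seed, and bin width are
arbitrary choices):

```python
import numpy as np

rng = np.random.default_rng(2)
x = rng.normal(size=10000)
N = len(x)

# Counts n_i in bins of width dx:
bins = np.arange(-10.0, 10.5, 0.5)
n, edges = np.histogram(x, bins=bins)
dx = edges[1] - edges[0]

# Normalize counts to a probability density: p_i = n_i / (N * dx).
p = n / (N * dx)

# numpy applies the same normalization with density=True:
p2, _ = np.histogram(x, bins=bins, density=True)

print(np.sum(p * dx))  # integral of the estimated density
```

Dividing by $N$ alone makes the bars sum to one; only the additional
division by $\Delta x$ makes their integral equal to one.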
A problem of using histograms for estimating probability densities is
that they have hard bin edges. Depending on where the bin edges are
placed, a data value falls in one or the other bin. As a result, the
shape of the resulting histogram depends on the exact position of its
bins (\figref{kerneldensityfig} left).

\begin{figure}[t]
  \includegraphics[width=1\textwidth]{kerneldensity}
  \titlecaption{\label{kerneldensityfig} Kernel densities.}{The
    histogram-based estimation of the probability density depends on
    the position of the bins (left). In the bottom plot the bins have
    been shifted by half a bin width (here $\Delta x=0.4$) and as a
    result details of the probability density look different. Look,
    for example, at the height of the largest bin. In contrast, a
    kernel density is uniquely defined for a given kernel width
    (right, Gaussian kernels with a standard deviation of
    $\sigma=0.2$).}
\end{figure}
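A kernel density like the one in the right panel of
\figref{kerneldensityfig} can be sketched in a few lines: place a
Gaussian on every data point and average. Illustrative Python/NumPy
(kernel width, grid, and seed are arbitrary choices):

```python
import numpy as np

rng = np.random.default_rng(3)
data = rng.normal(size=100)

# Kernel density estimate: a Gaussian of width sigma on every data
# point, averaged over all data points.
sigma = 0.2
x = np.linspace(-5.0, 5.0, 1001)
kernels = np.exp(-0.5 * ((x[:, None] - data[None, :]) / sigma)**2)
kernels /= sigma * np.sqrt(2.0 * np.pi)
kde = np.mean(kernels, axis=1)

# Unlike a histogram, this estimate does not depend on bin positions,
# and it integrates to one:
dx = x[1] - x[0]
print(np.sum(kde * dx))
```

Because each Gaussian kernel integrates to one, the average of the
kernels is itself normalized.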
To avoid this problem, so-called \entermde[kernel

and percentiles can be determined from the inverse cumulative function.
Use the estimate to compute the value of the 5\,\% percentile.
\end{exercise}
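Reading a percentile off an estimated cumulative function can be
sketched as follows (illustrative Python/NumPy; the exercise itself is
meant to be solved in MATLAB):

```python
import numpy as np

rng = np.random.default_rng(4)
x = rng.normal(size=1000)

# Empirical cumulative distribution: fraction of data below each value.
xs = np.sort(x)
cdf = np.arange(1, len(xs) + 1) / len(xs)

# Invert the cumulative function at 0.05 to get the 5% percentile:
p5 = xs[np.searchsorted(cdf, 0.05)]

# Cross-check against numpy's percentile function:
p5ref = np.percentile(x, 5)
print(p5, p5ref)
```

For a standard normal distribution the 5\,\% percentile is close to
$-1.645$; the empirical estimate scatters around this value.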

\newpage
\section{Correlations}

Until now we have described properties of univariate data sets. In