From b32000214a15091e3ff0a09e24ab809a1636bc95 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Mon, 25 Nov 2019 23:08:48 +0100
Subject: [PATCH] [statistics] fixed the chapter

---
 chapter.mk                                |  2 --
 statistics/lecture/statistics-chapter.tex |  5 ----
 statistics/lecture/statistics.tex         | 30 ++++++++++++-----------
 3 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/chapter.mk b/chapter.mk
index 7bed524..494949f 100644
--- a/chapter.mk
+++ b/chapter.mk
@@ -9,9 +9,7 @@ PYPDFFILES=$(PYFILES:.py=.pdf)
 pythonplots : $(PYPDFFILES)
 
 $(PYPDFFILES) : %.pdf: %.py
-	echo $$(which python)
 	python3 $<
-	#python $<
 
 cleanpythonplots :
 	rm -f $(PYPDFFILES)

diff --git a/statistics/lecture/statistics-chapter.tex b/statistics/lecture/statistics-chapter.tex
index 8d97b6c..1a87fc8 100644
--- a/statistics/lecture/statistics-chapter.tex
+++ b/statistics/lecture/statistics-chapter.tex
@@ -16,11 +16,6 @@
 
 \input{statistics}
 
-\section{TODO}
-\begin{itemize}
-\item Replace exercise 1.3 (boxwhisker) by one recreating figure 1.
-\end{itemize}
-
 \end{document}

diff --git a/statistics/lecture/statistics.tex b/statistics/lecture/statistics.tex
index d2abdaf..834f700 100644
--- a/statistics/lecture/statistics.tex
+++ b/statistics/lecture/statistics.tex
@@ -307,7 +307,7 @@ $\int p_g(x) \, dx = 1$, \eqnref{pdfnorm}.
 
 \newpage
 Histograms of real valued data depend on both the number of data
-values and the chosen bin width. As in the example with the die
+values \emph{and} the chosen bin width. As in the example with the die
 (\figref{diehistogramsfig} left), the height of the histogram gets
 larger the larger the size of the data set. Also, as the bin width is
 increased the height of the histogram increases, because more data
@@ -315,7 +315,7 @@ values fall within each bin (\figref{pdfhistogramfig} left).
 
 \begin{exercise}{gaussianbins.m}{}
   Draw 100 random data from a Gaussian distribution and plot
-  histograms with different bin sizes of the data. What do you
+  histograms of the data with different bin widths. What do you
   observe?
 \end{exercise}
 
@@ -328,8 +328,8 @@ histogram bars. Each bar has the height $n_i$ and the width $\Delta
 x$. The total area $A$ of the histogram is thus
 \[ A = \sum_{i=1}^N ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^N n_i = N \, \Delta x \]
 and the normalized histogram has the heights
-\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} = \frac{n_i}{N
-  \Delta x} \; .\]
+\[ p(x_i) = \frac{n_i}{A} = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} =
+  \frac{n_i}{N \Delta x} \; .\]
 A histogram needs to be divided by both the sum of the frequencies
 $n_i$ and the bin width $\Delta x$ to result in an estimate of the
 corresponding probability density. Only then can the distribution be
@@ -359,33 +359,35 @@ probability density functions like the one of the normal distribution
 \subsection{Kernel densities}
 
 A problem of using histograms for estimating probability densities is
-that they have hard bin edges. Depending on where the bin edges are placed
-a data value falls in one or the other bin.
+that they have hard bin edges. Depending on where the bin edges are
+placed a data value falls in one or the other bin. As a result the
+shape of the histogram depends on the exact position of its bins
+(\figref{kerneldensityfig} left).
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{kerneldensity}
   \titlecaption{\label{kerneldensityfig} Kernel densities.}{Left: The
     histogram-based estimation of the probability density is
    dependent on the position of the bins. In the bottom plot the bins have
-    bin shifted by half a bin width (here $\Delta x=0.4$) and as a
+    been shifted by half a bin width (here $\Delta x=0.4$) and as a
     result details of the probability density look different. Look,
-    for example at the height of the largest bin. Right: In contrast,
+    for example, at the height of the largest bin. Right: In contrast,
     a kernel density is uniquely defined for a given kernel width
-    (here Gaussian kernels with standard deviation of $\sigma=2$).}
+    (here Gaussian kernels with standard deviation of $\sigma=0.2$).}
 \end{figure}
 
-To avoid this problem one can use so called \enterm {kernel densities}
+To avoid this problem one can use so-called \enterm{kernel densities}
 for estimating probability densities from data. Here every data
 point is replaced by a kernel (a function with integral one, like for
 example the Gaussian) that is moved exactly to the position indicated
 by the data value. Then all the kernels of all the data values are
 summed up, the sum is divided by the number of data values,
-and we get an estimate of the probability density.
+and we get an estimate of the probability density
+(\figref{kerneldensityfig} right).
 
 As for the histogram, where we need to choose a bin width, we need to
 choose the width of the kernels appropriately.
 
-\newpage
 \begin{exercise}{gaussiankerneldensity.m}{}
   Construct and plot a kernel density of the data from the previous
   two exercises.
 \end{exercise}
@@ -430,7 +432,7 @@ and percentiles can be determined from the inverse cumulative function.
     by numerically integrating the normal distribution function
     (blue). From the cumulative distribution function one can read
     off the probabilities of getting values smaller than a given value
-    (here: $P(x \ge -1) \approx 0.15$). From the inverse cumulative
+    (here: $P(x \le -1) \approx 0.15$). From the inverse cumulative
     distribution the position of percentiles can be computed (here:
     the median (50\,\% percentile) is as expected close to zero and
     the third quartile (75\,\% percentile) at $x=0.68$.}
@@ -453,7 +455,7 @@ and percentiles can be determined from the inverse cumulative function.
 
 Until now we described properties of univariate data sets. In
 bivariate or multivariate data sets where we have pairs or tuples of
-data values (e.g. the size and the weight of elephants) we want to analyze
+data values (e.g. size and weight of elephants) we want to analyze
 dependencies between the variables. The \enterm{correlation
 coefficient}
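
The normalization in the corrected equation above, p(x_i) = n_i/(N Delta x), is easy to check numerically. A minimal sketch in Python with numpy (Python to match the python3 plotting rule in chapter.mk; the chapter's own exercise files are MATLAB, and all names below are illustrative):

    import numpy as np

    rng = np.random.default_rng(0)
    data = rng.standard_normal(100)      # 100 Gaussian random values, as in the exercise

    bin_width = 0.4                      # Delta x
    bins = np.arange(-4.0, 4.0 + bin_width, bin_width)
    n, edges = np.histogram(data, bins)  # absolute frequencies n_i per bin

    # divide by both the sum of the frequencies and the bin width,
    # i.e. by N * Delta x, to turn counts into a probability density
    p = n / (n.sum() * bin_width)

    print(np.sum(p * bin_width))         # the bars now integrate to one

Using n.sum() for N keeps the estimate normalized even if a few data values happen to fall outside the outermost bin edges.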
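The kernel density described in the rewritten paragraph (replace every data value by a kernel of integral one centered on it, sum all kernels, divide by the number of data values) can be sketched the same way; kernel_density is a hypothetical helper name, and sigma = 0.2 matches the corrected figure caption:

    import numpy as np

    def kernel_density(data, x, sigma=0.2):
        # one Gaussian of unit integral per data value, evaluated on the grid x
        k = np.exp(-0.5 * ((x[:, None] - data[None, :]) / sigma) ** 2)
        k /= np.sqrt(2.0 * np.pi) * sigma
        # sum all kernels and divide by the number of data values
        return k.sum(axis=1) / len(data)

    rng = np.random.default_rng(0)
    data = rng.standard_normal(100)
    x = np.linspace(-4.0, 4.0, 200)
    print(np.trapz(kernel_density(data, x), x))   # close to one

Unlike a histogram, shifting the grid x only changes where the estimate is sampled, not its shape, which is the point made in the right panel of the figure.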
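The numbers in the corrected caption of the cumulative distribution figure (P(x <= -1) of about 0.15, median near zero, third quartile near x = 0.68) can be reproduced from samples. A sketch assuming a standard normal sample (exact values vary with the data):

    import numpy as np

    rng = np.random.default_rng(0)
    data = np.sort(rng.standard_normal(1000))

    # empirical cumulative distribution: fraction of data values <= x
    cdf = np.arange(1, len(data) + 1) / len(data)

    # probability of values smaller than -1 (about 0.15 for a standard normal)
    print(cdf[np.searchsorted(data, -1.0)])

    # percentiles invert the cumulative: median and third quartile
    print(np.percentile(data, [50, 75]))          # roughly 0.0 and 0.68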