From b32000214a15091e3ff0a09e24ab809a1636bc95 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Mon, 25 Nov 2019 23:08:48 +0100
Subject: [PATCH] [statistics] fixed the chapter

---
 chapter.mk                                |  2 --
 statistics/lecture/statistics-chapter.tex |  5 ----
 statistics/lecture/statistics.tex         | 30 ++++++++++++-----------
 3 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/chapter.mk b/chapter.mk
index 7bed524..494949f 100644
--- a/chapter.mk
+++ b/chapter.mk
@@ -9,9 +9,7 @@ PYPDFFILES=$(PYFILES:.py=.pdf)
 pythonplots : $(PYPDFFILES)
 
 $(PYPDFFILES) : %.pdf: %.py
-	echo $$(which python)
 	python3 $<
-	#python $<
 
 cleanpythonplots :
 	rm -f $(PYPDFFILES)

diff --git a/statistics/lecture/statistics-chapter.tex b/statistics/lecture/statistics-chapter.tex
index 8d97b6c..1a87fc8 100644
--- a/statistics/lecture/statistics-chapter.tex
+++ b/statistics/lecture/statistics-chapter.tex
@@ -16,11 +16,6 @@
 
 \input{statistics}
 
-\section{TODO}
-\begin{itemize}
-\item Replace exercise 1.3 (boxwhisker) by one recreating figure 1.
-\end{itemize}
-
 \end{document}

diff --git a/statistics/lecture/statistics.tex b/statistics/lecture/statistics.tex
index d2abdaf..834f700 100644
--- a/statistics/lecture/statistics.tex
+++ b/statistics/lecture/statistics.tex
@@ -307,7 +307,7 @@ $\int p_g(x) \, dx = 1$, \eqnref{pdfnorm}.
 
 \newpage
 Histograms of real valued data depend on both the number of data
-values and the chosen bin width. As in the example with the die
+values \emph{and} the chosen bin width. As in the example with the die
 (\figref{diehistogramsfig} left), the height of the histogram gets
 larger the larger the size of the data set. Also, as the bin width is
 increased the height of the histogram increases, because more data
@@ -315,7 +315,7 @@ values fall within each bin (\figref{pdfhistogramfig} left).
 
 \begin{exercise}{gaussianbins.m}{}
   Draw 100 random data from a Gaussian distribution and plot
-  histograms with different bin sizes of the data. What do you
+  histograms of the data with different bin widths. What do you
   observe?
 \end{exercise}
 
@@ -328,8 +328,8 @@ histogram bars. Each bar has the height $n_i$ and the width $\Delta
 x$. The total area $A$ of the histogram is thus
 \[ A = \sum_{i=1}^N ( n_i \cdot \Delta x ) = \Delta x \sum_{i=1}^N n_i = N \, \Delta x \]
 and the normalized histogram has the heights
-\[ p(x_i) = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} = \frac{n_i}{N
-  \Delta x} \; .\]
+\[ p(x_i) = \frac{n_i}{A} = \frac{n_i}{\Delta x \sum_{i=1}^N n_i} =
+  \frac{n_i}{N \Delta x} \; .\]
 A histogram needs to be divided by both the sum of the frequencies
 $n_i$ and the bin width $\Delta x$ to result in an estimate of the
 corresponding probability density. Only then can the distribution be
@@ -359,33 +359,35 @@ probability density functions like the one of the normal distribution
 \subsection{Kernel densities}
 
 A problem of using histograms for estimating probability densities is
-that they have hard bin edges. Depending on where the bin edges are placed
-a data value falls in one or the other bin.
+that they have hard bin edges. Depending on where the bin edges are
+placed a data value falls in one or the other bin. As a result the
+shape of the histogram depends on the exact position of its bins
+(\figref{kerneldensityfig} left).
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{kerneldensity}
   \titlecaption{\label{kerneldensityfig} Kernel densities.}{Left: The
     histogram-based estimation of the probability density is
    dependent on the position of the bins. In the bottom plot the bins have
-    bin shifted by half a bin width (here $\Delta x=0.4$) and as a
+    been shifted by half a bin width (here $\Delta x=0.4$) and as a
     result details of the probability density look different. Look,
-    for example at the height of the largest bin. Right: In contrast,
+    for example, at the height of the largest bin. Right: In contrast,
     a kernel density is uniquely defined for a given kernel width
-    (here Gaussian kernels with standard deviation of $\sigma=2$).}
+    (here Gaussian kernels with standard deviation of $\sigma=0.2$).}
 \end{figure}
 
-To avoid this problem one can use so called \enterm {kernel densities}
+To avoid this problem one can use so-called \enterm{kernel densities}
 for estimating probability densities from data. Here every data
 point is replaced by a kernel (a function with integral one, like for
 example the Gaussian) that is moved exactly to the position indicated
 by the data value. Then all the kernels of all the data values are
 summed up, the sum is divided by the number of data values,
-and we get an estimate of the probability density.
+and we get an estimate of the probability density
+(\figref{kerneldensityfig} right).
 
 As for the histogram, where we need to choose a bin width, we need to
 choose the width of the kernels appropriately.
 
-\newpage
 \begin{exercise}{gaussiankerneldensity.m}{}
   Construct and plot a kernel density of the data from the previous
   two exercises.
 \end{exercise}
@@ -430,7 +432,7 @@ and percentiles can be determined from the inverse cumulative function.
     by numerically integrating the normal distribution function
     (blue). From the cumulative distribution function one can read
     off the probabilities of getting values smaller than a given value
-    (here: $P(x \ge -1) \approx 0.15$). From the inverse cumulative
+    (here: $P(x \le -1) \approx 0.15$). From the inverse cumulative
     distribution the position of percentiles can be computed (here:
     the median (50\,\% percentile) is as expected close to zero and
     the third quartile (75\,\% percentile) at $x=0.68$.}
@@ -453,7 +455,7 @@ and percentiles can be determined from the inverse cumulative function.
 
 Until now we described properties of univariate data sets. In
 bivariate or multivariate data sets where we have pairs or tuples of
-data values (e.g. the size and the weight of elephants) we want to analyze
+data values (e.g. size and weight of elephants) we want to analyze
 dependencies between the variables. The \enterm{correlation
 coefficient}
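
The normalization in the corrected equation above, p(x_i) = n_i/(N Delta x), is easy to check numerically. A minimal sketch in Python with numpy (Python to match the python3 plotting rule in chapter.mk; the chapter's own exercise files are MATLAB, and all names below are illustrative):

    import numpy as np

    rng = np.random.default_rng(0)
    data = rng.standard_normal(100)      # 100 Gaussian random values, as in the exercise

    bin_width = 0.4                      # Delta x
    bins = np.arange(-4.0, 4.0 + bin_width, bin_width)
    n, edges = np.histogram(data, bins)  # absolute frequencies n_i per bin

    # divide by both the sum of the frequencies and the bin width,
    # i.e. by N * Delta x, to turn counts into a probability density
    p = n / (n.sum() * bin_width)

    print(np.sum(p * bin_width))         # the bars now integrate to one

Using n.sum() for N keeps the estimate normalized even if a few data values happen to fall outside the outermost bin edges.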
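The kernel density described in the rewritten paragraph (replace every data value by a kernel of integral one centered on it, sum all kernels, divide by the number of data values) can be sketched the same way; kernel_density is a hypothetical helper name, and sigma = 0.2 matches the corrected figure caption:

    import numpy as np

    def kernel_density(data, x, sigma=0.2):
        # one Gaussian of unit integral per data value, evaluated on the grid x
        k = np.exp(-0.5 * ((x[:, None] - data[None, :]) / sigma) ** 2)
        k /= np.sqrt(2.0 * np.pi) * sigma
        # sum all kernels and divide by the number of data values
        return k.sum(axis=1) / len(data)

    rng = np.random.default_rng(0)
    data = rng.standard_normal(100)
    x = np.linspace(-4.0, 4.0, 200)
    print(np.trapz(kernel_density(data, x), x))   # close to one

Unlike a histogram, shifting the grid x only changes where the estimate is sampled, not its shape, which is the point made in the right panel of the figure.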
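The numbers in the corrected caption of the cumulative distribution figure (P(x <= -1) of about 0.15, median near zero, third quartile near x = 0.68) can be reproduced from samples. A sketch assuming a standard normal sample (exact values vary with the data):

    import numpy as np

    rng = np.random.default_rng(0)
    data = np.sort(rng.standard_normal(1000))

    # empirical cumulative distribution: fraction of data values <= x
    cdf = np.arange(1, len(data) + 1) / len(data)

    # probability of values smaller than -1 (about 0.15 for a standard normal)
    print(cdf[np.searchsorted(data, -1.0)])

    # percentiles invert the cumulative: median and third quartile
    print(np.percentile(data, [50, 75]))          # roughly 0.0 and 0.68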