From 1d0913c600e996220adb8070cf88f6950ae40433 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Sat, 9 Jan 2021 13:40:35 +0100
Subject: [PATCH] [likelihood] improved text 1

---
 likelihood/lecture/likelihood.tex | 253 +++++++++++++++++-------------
 likelihood/lecture/mlemean.py     |   4 +-
 regression/lecture/regression.tex |   1 +
 3 files changed, 150 insertions(+), 108 deletions(-)

diff --git a/likelihood/lecture/likelihood.tex b/likelihood/lecture/likelihood.tex
index 28cc395..cae9ff0 100644
--- a/likelihood/lecture/likelihood.tex
+++ b/likelihood/lecture/likelihood.tex
@@ -4,111 +4,132 @@
 \label{maximumlikelihoodchapter}
 \exercisechapter{Maximum likelihood estimation}
 
-A common problem in statistics is to estimate from a probability
-distribution one or more parameters $\theta$ that best describe the
-data $x_1, x_2, \ldots x_n$. \enterm[maximum likelihood
-estimator]{Maximum likelihood estimators} (\enterm[mle|see{maximum
-  likelihood estimator}]{mle},
-\determ{Maximum-Likelihood-Sch\"atzer}) choose the parameters such
-that they maximize the likelihood of the data $x_1, x_2, \ldots x_n$
-to originate from the distribution.
+The core task of statistics is to infer from measured data some
+parameters describing the data. These parameters can simply be a mean,
+a standard deviation, or any other parameter needed to describe the
+distribution the data originate from, a correlation
+coefficient, or some parameters of a function describing a particular
+dependence between the data. The brain faces exactly the same
+problem. Given the activity pattern of some neurons (the data), it
+needs to infer some aspects (parameters) of the environment and the
+internal state of the body in order to generate some useful
+behavior. One possible approach to estimating parameters from data is
+to use \enterm[maximum likelihood estimator]{maximum likelihood estimators}
+(\enterm[mle|see{maximum likelihood estimator}]{mle},
+\determ{Maximum-Likelihood-Sch\"atzer}). They choose the parameters
+such that they maximize the likelihood that the specific data values
+originate from a specific distribution.
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Maximum Likelihood}
+\section{Maximum likelihood}
 
 Let $p(x|\theta)$ (to be read as ``probability(density) of $x$ given
-$\theta$.'') the probability (density) distribution of $x$ given the
-parameters $\theta$. This could be the normal distribution
+$\theta$.'') be the probability (density) distribution of data value $x$
+given parameter values $\theta$. This could be the normal distribution
 \begin{equation}
   \label{normpdfmean}
   p(x|\mu, \sigma) = \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}
 \end{equation}
 defined by the mean $\mu$ and the standard deviation $\sigma$ as
-parameters $\theta$. If the $n$ independent observations of $x_1,
-x_2, \ldots x_n$ originate from the same probability density
-distribution (they are \enterm[i.i.d.|see{independent and identically
-  distributed}]{i.i.d.}, \enterm{independent and identically
-  distributed}) then the conditional probability $p(x_1,x_2, \ldots
-x_n|\theta)$ of observing $x_1, x_2, \ldots x_n$ given some specific
-parameter values $\theta$ is given by
+parameters $\theta$. If the $n$ observations $x_1, x_2, \ldots, x_n$
+are independent of each other and originate from the same probability
+density distribution (they are \enterm[i.i.d.|see{independent and
+  identically distributed}]{i.i.d.}, \enterm{independent and
+  identically distributed}), then the conditional probability
+$p(x_1,x_2, \ldots, x_n|\theta)$ of observing the particular data
+values $x_1, x_2, \ldots, x_n$ given some specific parameter values
+$\theta$ of the probability density is given by the product of the
+probability densities of each data value:
 \begin{equation}
-  p(x_1,x_2, \ldots x_n|\theta) = p(x_1|\theta) \cdot p(x_2|\theta)
-  \ldots p(x_n|\theta) = \prod_{i=1}^n p(x_i|\theta) \; .
+  \label{probdata}
+  p(x_1,x_2, \ldots, x_n|\theta) = p(x_1|\theta) \cdot p(x_2|\theta)
+  \ldots p(x_n|\theta) = \prod_{i=1}^n p(x_i|\theta) \; .
 \end{equation}
 Vice versa, the \entermde{Likelihood}{likelihood} of the parameters $\theta$
-given the observed data $x_1, x_2, \ldots x_n$ is
+given the observed data $x_1, x_2, \ldots, x_n$ is
 \begin{equation}
-  {\cal L}(\theta|x_1,x_2, \ldots x_n) = p(x_1,x_2, \ldots x_n|\theta) \; .
+  \label{likelihood}
+  {\cal L}(\theta|x_1,x_2, \ldots, x_n) = p(x_1,x_2, \ldots, x_n|\theta) \; .
 \end{equation}
-Note: the likelihood ${\cal L}$ is not a probability in the
+Note that the likelihood ${\cal L}$ is not a probability in the
 classic sense since it does not integrate to unity ($\int {\cal
-  L}(\theta|x_1,x_2, \ldots x_n) \, d\theta \ne 1$).
-
-When applying maximum likelihood estimations we are interested in the
-parameter values
+  L}(\theta|x_1,x_2, \ldots, x_n) \, d\theta \ne 1$). For given
+observations $x_1, x_2, \ldots, x_n$, the likelihood
+\eqref{likelihood} is a function of the parameters $\theta$. This
+function has a global maximum for some specific parameter values. At
+this maximum the probability \eqref{probdata} of observing the measured
+data values is largest.
+
+Maximum likelihood estimators just find the parameter values
 \begin{equation}
-  \theta_{mle} = \text{argmax}_{\theta} {\cal L}(\theta|x_1,x_2, \ldots x_n)
+  \theta_{mle} = \text{argmax}_{\theta} {\cal L}(\theta|x_1,x_2, \ldots, x_n)
 \end{equation}
-that maximize the likelihood. $\text{argmax}_xf(x)$ is the value of
-the argument $x$ for which the function $f(x)$ assumes its global
-maximum. Thus, we search for the parameter values $\theta$ at which
-the likelihood ${\cal L}(\theta)$ reaches its maximum. For these
-paraemter values the measured data most likely originated from the
-corresponding distribution.
+that maximize the likelihood \eqref{likelihood}.
+$\text{argmax}_xf(x)$ is the value of the argument $x$ for which the
+function $f(x)$ assumes its global maximum. Thus, we search for the
+parameter values $\theta$ at which the likelihood ${\cal L}(\theta)$
+reaches its maximum. For these parameter values the measured data most
+likely originated from the corresponding distribution.
 
 The position of a function's maximum does not change when the values
 of the function are transformed by a strictly monotonously rising
 function such as the logarithm. For numerical reasons and reasons that
-we will discuss below, we search for the maximum of the logarithm of
-the likelihood
-(\entermde[likelihood!log-]{Likelihood!Log-}{log-likelihood}):
-
+we discuss below, we instead search for the maximum of the logarithm
+of the likelihood
+(\entermde[likelihood!log-]{Likelihood!Log-}{log-likelihood})
 \begin{eqnarray}
-  \theta_{mle} & = & \text{argmax}_{\theta}\; {\cal L}(\theta|x_1,x_2, \ldots x_n) \nonumber \\
-  & = & \text{argmax}_{\theta}\; \log {\cal L}(\theta|x_1,x_2, \ldots x_n) \nonumber \\
+  \theta_{mle} & = & \text{argmax}_{\theta}\; {\cal L}(\theta|x_1,x_2, \ldots, x_n) \nonumber \\
+  & = & \text{argmax}_{\theta}\; \log {\cal L}(\theta|x_1,x_2, \ldots, x_n) \nonumber \\
   & = & \text{argmax}_{\theta}\; \log \prod_{i=1}^n p(x_i|\theta) \nonumber \\
   & = & \text{argmax}_{\theta}\; \sum_{i=1}^n \log p(x_i|\theta) \label{loglikelihood}
 \end{eqnarray}
+which is the sum of the logarithms of the probabilities of each
+observation. Let's illustrate the concept of maximum likelihood
+estimation with the example of the arithmetic mean.
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\subsection{Example: the arithmetic mean}
-Suppose that the measurements $x_1, x_2, \ldots x_n$ originate from a
-normal distribution \eqnref{normpdfmean} and we consider the mean
-$\mu$ as the only parameter $\theta$. Which value of $\theta$
-maximizes the likelihood of the data?
+\subsection{Arithmetic mean}
+Suppose that the measurements $x_1, x_2, \ldots, x_n$ originate from a
+normal distribution \eqref{normpdfmean} and we do not know the
+population mean $\mu$ of the normal distribution
+(\figrefb{mlemeanfig}). In this setting $\mu$ is the only parameter
+$\theta$. Which value of $\mu$ maximizes the likelihood of the data?
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{mlemean}
   \titlecaption{\label{mlemeanfig} Maximum likelihood estimation of
     the mean.}{Top: The measured data (blue dots) together with three
-    different possible normal distributions with different means
-    (arrows) the data could have originated from. Bottom left: the
-    likelihood as a function of $\theta$ i.e. the mean. It is maximal
-    at a value of $\theta = 2$. Bottom right: the
-    log-likelihood. Taking the logarithm does not change the position
-    of the maximum.}
+    normal distributions differing in their means (arrows) from which
+    the data could have originated. Bottom left: the likelihood
+    as a function of the parameter $\mu$. For these data it is maximal
+    at a value of $\mu = 2$. Bottom right: the log-likelihood. Taking
+    the logarithm does not change the position of the maximum.}
 \end{figure}
 
-The log-likelihood \eqnref{loglikelihood}
+Inserting the normal distribution \eqref{normpdfmean} and applying
+logarithmic identities, the log-likelihood \eqref{loglikelihood} reads
 \begin{eqnarray}
-  \log {\cal L}(\theta|x_1,x_2, \ldots x_n)
-  & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x_i-\theta)^2}{2\sigma^2}} \nonumber \\
-  & = & \sum_{i=1}^n - \log \sqrt{2\pi \sigma^2} -\frac{(x_i-\theta)^2}{2\sigma^2} \; .
+  \log {\cal L}(\mu|x_1,x_2, \ldots, x_n)
+  & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x_i-\mu)^2}{2\sigma^2}} \nonumber \\
+  & = & \sum_{i=1}^n - \log \sqrt{2\pi \sigma^2} -\frac{(x_i-\mu)^2}{2\sigma^2} \; .
 \end{eqnarray}
 Since the logarithm is the inverse function of the exponential
 ($\log(e^x)=x$), taking the logarithm removes the exponential from the
-normal distribution. To calculate the maximum of the log-likelihood,
-we need to take the derivative with respect to $\theta$ and set it to
-zero:
+normal distribution. This is the second reason why it is useful to
+maximize the log-likelihood. To calculate the maximum of the
+log-likelihood, we need to take the derivative with respect to $\mu$
+and set it to zero:
 \begin{eqnarray}
-  \frac{\text{d}}{\text{d}\theta} \log {\cal L}(\theta|x_1,x_2, \ldots x_n) & = & \sum_{i=1}^n - \frac{2(x_i-\theta)}{2\sigma^2} \;\; = \;\; 0 \nonumber \\
-  \Leftrightarrow \quad \sum_{i=1}^n x_i - \sum_{i=1}^n \theta & = & 0 \nonumber \\
-  \Leftrightarrow \quad n \theta & = & \sum_{i=1}^n x_i \nonumber \\
-  \Leftrightarrow \quad \theta & = & \frac{1}{n} \sum_{i=1}^n x_i \;\; = \;\; \bar x
+  \frac{\text{d}}{\text{d}\mu} \log {\cal L}(\mu|x_1,x_2, \ldots, x_n) & = & \sum_{i=1}^n - \frac{\text{d}}{\text{d}\mu} \log \sqrt{2\pi \sigma^2} - \frac{\text{d}}{\text{d}\mu} \frac{(x_i-\mu)^2}{2\sigma^2} \;\; = \;\; 0 \nonumber \\
+  \Leftrightarrow \quad \sum_{i=1}^n \frac{2(x_i-\mu)}{2\sigma^2} & = & 0 \nonumber \\
+  \Leftrightarrow \quad \sum_{i=1}^n x_i - \sum_{i=1}^n \mu & = & 0 \nonumber \\
+  \Leftrightarrow \quad n \mu & = & \sum_{i=1}^n x_i \nonumber \\
+  \Leftrightarrow \quad \mu & = & \frac{1}{n} \sum_{i=1}^n x_i \;\; = \;\; \bar x
 \end{eqnarray}
-Thus, the maximum likelihood estimator is the arithmetic mean. That
-is, the arithmetic mean maximizes the likelihood that the data
-originate from a normal distribution centered at the arithmetic mean
+Thus, the maximum likelihood estimator of the population mean of
+normally distributed data is the arithmetic mean. That is, the
+arithmetic mean maximizes the likelihood that the data originate from
+a normal distribution centered at the arithmetic mean
 (\figref{mlemeanfig}). Equivalently, the standard deviation computed
 from the data, maximizes the likelihood that the data were generated
 from a normal distribution with this standard deviation.
@@ -123,6 +144,17 @@ from a normal distribution with this standard deviation.
   the maxima with the mean calculated from the data.
 \end{exercise}
 
+Comparing the values of the likelihood with those of the
+log-likelihood shown in \figref{mlemeanfig} reveals the numerical
+reason for taking the logarithm of the likelihood. The likelihood
+values can get very small, because we multiply many, potentially
+small, probability densities with each other. The likelihood quickly
+gets smaller than the smallest number a computer's floating point
+numbers can represent. Try it by increasing the number of data values
+in the exercise. Taking the logarithm avoids this problem. The
+log-likelihood takes on well-behaved values that the computer can
+handle.
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Fitting probability distributions}
 
@@ -130,32 +162,39 @@ Consider normally distributed data with unknown mean and standard
 deviation. From the considerations above we just have seen that a
 Gaussian distribution with mean at the arithmetic mean and standard
 deviation equal to the standard deviation computed from the data is
-the best Gaussian distribution that fits the data best in a maximum
-likelihood sense, i.e. the likelihood that the data were generated
-from this distribution is the largest. Fitting a Gaussian distribution
-to data is very simple: just compute the two parameter of the Gaussian
-distribution as the arithmetic mean and a standard deviation directly
-from the data. 
-
-For non-Gaussian distributions (e.g. a Gamma-distribution), however,
-such simple analytical expressions for the parameters of the
-distribution do not exist, e.g. the shape parameter of a
-\entermde[distribution!Gamma-]{Verteilung!Gamma-}{Gamma-distribution}. How
-do we fit such a distribution to some data? That is, how should we
-compute the values of the parameters of the distribution, given the
-data?
+the Gaussian that fits the data best in a maximum likelihood sense,
+i.e. the likelihood that the data were generated from this
+distribution is the largest. Fitting a Gaussian distribution to data
+is very simple: just compute the two parameters of the Gaussian
+distribution, $\mu$ and $\sigma$, as the arithmetic mean and the
+standard deviation, respectively, directly from the data.
+
+For non-Gaussian distributions, for example a
+\entermde[distribution!Gamma-]{Verteilung!Gamma-}{Gamma-distribution}
+\begin{equation}
+  \label{gammapdf}
+  p(x|\alpha,\beta) \sim x^{\alpha-1}e^{-\beta x} \; ,
+\end{equation}
+however, such simple analytical expressions for the parameters of the
+distribution do not exist. This is the case, for example, for the
+shape parameter $\alpha$ of the Gamma-distribution. How do we fit such
+a distribution to some data? That is, how should we compute the
+values of the parameters of the distribution, given the data?
 
 A first guess could be to fit the probability density function by
 minimization of the squared difference to a histogram of the measured
-data. For several reasons this is, however, not the method of choice:
-(i) Probability densities can only be positive which leads, for small
-values in particular, to asymmetric distributions. (ii) The values of
-a histogram estimating the density are not independent because the
-integral over a density is unity. The two basic assumptions of
-normally distributed and independent samples, which are a prerequisite
-make the minimization of the squared difference \eqnref{chisqmin} to a
-maximum likelihood estimation, are violated. (iii) The histogram
-strongly depends on the chosen bin size \figref{mlepdffig}).
+data in the same way as we fit a function to some data. For several
+reasons this is, however, not the method of choice: (i) Probability
+densities can only be positive, which leads, for small values in
+particular, to asymmetric distributions of the estimated histogram
+around the true density. (ii) The values of a histogram estimating the
+density are not independent because the integral over a density is
+unity. The two basic assumptions of normally distributed and
+independent samples, which are a prerequisite for the minimization of
+the squared difference to be a maximum likelihood estimation (see
+next section), are violated. (iii) The estimation of the probability
+density by means of a histogram strongly depends on the chosen bin
+size (\figref{mlepdffig}).
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{mlepdf}
@@ -173,11 +212,10 @@ Instead we should stay with maximum-likelihood estimation. Exactly in
 the same way as we estimated the mean value of a Gaussian distribution
 above, we can numerically fit the parameter of any type of
 distribution directly from the data by means of maximizing the
-likelihood. We simply search for the parameter $\theta$ of the
-desired probability density function that maximizes the
-log-likelihood. In general this is a non-linear optimization problem
-that is solved with numerical methods such as the gradient descent
-\matlabfun{mle()}.
+likelihood. We simply search for the parameter values of the desired
+probability density function that maximize the log-likelihood. In
+general this is a non-linear optimization problem that is solved with
+numerical methods such as gradient descent \matlabfun{mle()}.
 
 \begin{exercise}{mlegammafit.m}{mlegammafit.out}
   Generate a sample of gamma-distributed random numbers and apply the
@@ -191,12 +229,16 @@ When fitting a function of the form $f(x;\theta)$ to data pairs
 $(x_i|y_i)$ one tries to adapt the parameter $\theta$ such that the
-function best describes the data. With maximum likelihood we search
-for the parameter value $\theta$ for which the likelihood that the data
-were drawn from the corresponding function is maximal. If we assume
-that the $y_i$ values are normally distributed around the function
-values $f(x_i;\theta)$ with a standard deviation $\sigma_i$, the
-log-likelihood is
+function best describes the data. In
+chapter~\ref{gradientdescentchapter} we simply assumed that ``best''
+means minimizing the squared distance between the data and the
+function. With maximum likelihood we search for the parameter value
+$\theta$ for which the likelihood that the data were drawn from the
+corresponding function is maximal.
+
+If we assume that the $y_i$ values are normally distributed around the
+function values $f(x_i;\theta)$ with a standard deviation $\sigma_i$,
+the log-likelihood is
 \begin{eqnarray}
   \log {\cal L}(\theta|(x_1,y_1,\sigma_1), \ldots, (x_n,y_n,\sigma_n))
   & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma_i^2}}e^{-\frac{(y_i-f(x_i;\theta))^2}{2\sigma_i^2}} \nonumber \\
@@ -218,18 +260,17 @@ the position of the minimum:
   \theta_{mle} = \text{argmin}_{\theta} \; \sum_{i=1}^n \left( \frac{y_i-f(x_i;\theta)}{\sigma_i} \right)^2 \;\; = \;\; \text{argmin}_{\theta} \; \chi^2
 \end{equation}
 The sum of the squared differences normalized by the standard
-deviation is also called $\chi^2$. The parameter $\theta$ which
-minimizes the squared differences is thus the one that maximizes the
-likelihood that the data actually originate from the given
-function. Minimizing $\chi^2$ therefore is a maximum likelihood
+deviation is also called $\chi^2$ (chi squared). The parameter
+$\theta$ which minimizes the squared differences is thus the one that
+maximizes the likelihood that the data actually originate from the
+given function. Therefore, minimizing $\chi^2$ is a maximum likelihood
 estimation.
 From the mathematical considerations above we can see that the
 minimization of the squared difference is a maximum-likelihood
 estimation only if the data are normally distributed around the
 function. In case of other distributions, the log-likelihood
-\eqnref{loglikelihood} needs to be adapted accordingly and be
-maximized respectively.
+\eqnref{loglikelihood} needs to be adapted accordingly.
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{mlepropline}
@@ -377,7 +418,7 @@ orientation $\phi$ of an edge is given by
 The log-likelihood of the edge orientation $\phi$ given the activity
 pattern in the population $r_1$, $r_2$, ... $r_n$ is thus
 \begin{equation}
-  {\cal L}(\phi|r_1, r_2, \ldots r_n) = \sum_{i=1}^n \log p_i(r_i|\phi)
+  \log {\cal L}(\phi|r_1, r_2, \ldots, r_n) = \sum_{i=1}^n \log p_i(r_i|\phi)
 \end{equation}
 The angle $\phi$ that maximizes this likelihood is then an estimate of
 the orientation of the edge. 
diff --git a/likelihood/lecture/mlemean.py b/likelihood/lecture/mlemean.py index a1fb974..e18618d 100644 --- a/likelihood/lecture/mlemean.py +++ b/likelihood/lecture/mlemean.py @@ -52,7 +52,7 @@ for i, theta in enumerate(thetas) : p=np.prod(ps,axis=0) # plot it: ax = fig.add_subplot(spec[1, 0]) -ax.set_xlabel(r'Parameter $\theta$') +ax.set_xlabel(r'Parameter $\mu$') ax.set_ylabel('Likelihood') ax.set_xticks(np.arange(1.6, 2.5, 0.4)) ax.annotate('Maximum', @@ -68,7 +68,7 @@ ax.annotate('', ax.plot(thetas, p, **lsAm) ax = fig.add_subplot(spec[1, 1]) -ax.set_xlabel(r'Parameter $\theta$') +ax.set_xlabel(r'Parameter $\mu$') ax.set_ylabel('Log-Likelihood') ax.set_ylim(-50,-20) ax.set_xticks(np.arange(1.6, 2.5, 0.4)) diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex index 2d13b96..6199650 100644 --- a/regression/lecture/regression.tex +++ b/regression/lecture/regression.tex @@ -1,4 +1,5 @@ \chapter{Optimization and gradient descent} +\label{gradientdescentchapter} \exercisechapter{Optimization and gradient descent} Optimization problems arise in many different contexts. For example,
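
A minimal Python sketch of the numerical maximum-likelihood fit described in the patched text (fitting the shape parameter of a gamma distribution by maximizing the log-likelihood, as in the mlegammafit exercise). This is an illustration only and not part of the patched sources; it assumes numpy and scipy are available and, for simplicity, keeps the scale parameter of the gamma distribution fixed at one:

    import numpy as np
    from scipy.stats import gamma
    from scipy.optimize import minimize

    # Draw gamma-distributed samples with a known shape parameter.
    rng = np.random.default_rng(42)
    alpha_true = 2.5
    data = rng.gamma(alpha_true, scale=1.0, size=500)

    def neg_log_likelihood(alpha, x):
        # Negative log-likelihood of the shape parameter alpha
        # (scale fixed at 1): -sum_i log p(x_i | alpha).
        return -np.sum(gamma.logpdf(x, alpha))

    # Non-linear optimization of the log-likelihood (L-BFGS-B with
    # a positivity bound on alpha).
    result = minimize(neg_log_likelihood, x0=1.0, args=(data,),
                      bounds=[(1e-6, None)])
    print(f"true shape: {alpha_true:.2f}, "
          f"estimated shape: {result.x[0]:.2f}")

Minimizing the negative log-likelihood is equivalent to maximizing the log-likelihood; with more samples the estimated shape parameter approaches the true value.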