From 2e0b2a52392b262c0be2281799d15753b2c68a3d Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Tue, 26 Nov 2019 15:37:28 +0100
Subject: [PATCH] [likelihood] fixed english text

---
 likelihood/lecture/likelihood.tex | 173 ++++++++++++++----------------
 likelihood/lecture/mlepdf.py      |   5 +-
 2 files changed, 87 insertions(+), 91 deletions(-)

diff --git a/likelihood/lecture/likelihood.tex b/likelihood/lecture/likelihood.tex
index dd1f695..0579409 100644
--- a/likelihood/lecture/likelihood.tex
+++ b/likelihood/lecture/likelihood.tex
@@ -4,15 +4,13 @@
 \chapter{\tr{Maximum likelihood estimation}{Maximum-Likelihood-Sch\"atzer}}
 \label{maximumlikelihoodchapter}
 
-\selectlanguage{english}
-
-There are situations in which we want to estimate one or more
-parameters $\theta$ of a probability distribution that best describe
-the data $x_1, x_2, \ldots x_n$. \enterm{Maximum likelihood
-  estimators} (\determ{Maximum-Likelihood-Sch\"atzer},
-\determ[mle|see{Maximum-Likelihood-Sch\"atzer}]{mle}) choose the
-parameters such that it maximizes the likelihood of $x_1, x_2, \ldots
-x_n$ originating from the distribution.
+A common problem in statistics is to estimate from a probability
+distribution one or more parameters $\theta$ that best describe the
+data $x_1, x_2, \ldots x_n$. \enterm{Maximum likelihood estimators}
+(\enterm[mle|see{Maximum likelihood estimators}]{mle},
+\determ{Maximum-Likelihood-Sch\"atzer}) choose the parameters such
+that they maximize the likelihood that the data $x_1, x_2, \ldots
+x_n$ originate from the distribution.
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Maximum Likelihood}
@@ -22,20 +20,21 @@ $\theta$.'') the probability (density) distribution of $x$ given the
 parameters $\theta$. This could be the normal distribution
 \begin{equation}
   \label{normpdfmean}
-  p(x|\theta) = \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}
+  p(x|\mu, \sigma) = \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x-\mu)^2}{2\sigma^2}}
 \end{equation}
-defined by the mean ($\mu$) and the standard deviation $\sigma$ as
+defined by the mean $\mu$ and the standard deviation $\sigma$ as
 parameters $\theta$. If the $n$ independent observations of $x_1,
 x_2, \ldots x_n$ originate from the same probability density
-distribution (\enterm{i.i.d.} independent and identically distributed)
-then is the conditional probability $p(x_1,x_2, \ldots x_n|\theta)$ of
-observing $x_1, x_2, \ldots x_n$ given the a specific $\theta$,
+distribution (they are \enterm{i.i.d.}, independent and identically
+distributed) then the conditional probability $p(x_1,x_2, \ldots
+x_n|\theta)$ of observing $x_1, x_2, \ldots x_n$ given a specific
+$\theta$ is given by
 \begin{equation}
   p(x_1,x_2, \ldots x_n|\theta) = p(x_1|\theta) \cdot p(x_2|\theta)
   \ldots p(x_n|\theta) = \prod_{i=1}^n p(x_i|\theta) \; .
 \end{equation}
-Vice versa, is the \enterm{likelihood} of the parameters $\theta$
-given the observed data $x_1, x_2, \ldots x_n$,
+Vice versa, the \enterm{likelihood} of the parameters $\theta$
+given the observed data $x_1, x_2, \ldots x_n$ is
 \begin{equation}
   {\cal L}(\theta|x_1,x_2, \ldots x_n) = p(x_1,x_2, \ldots x_n|\theta) \; .
 \end{equation}
@@ -44,14 +43,14 @@ classic sense since it does not integrate to unity ($\int {\cal
 L}(\theta|x_1,x_2, \ldots x_n) \, d\theta \ne 1$).
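The likelihood of i.i.d. data can also be evaluated directly by multiplying the single-sample densities. A minimal Python sketch of this definition (the sample size, the assumed true mean of 2, and the use of numpy/scipy are choices made only for this illustration):

import numpy as np
from scipy.stats import norm

# Hypothetical data set: 100 samples around an assumed true mean of 2.
rng = np.random.RandomState(42)
x = rng.randn(100) + 2.0

def likelihood(theta, x, sigma=1.0):
    # product of the single-sample densities p(x_i|theta)
    return np.prod(norm.pdf(x, loc=theta, scale=sigma))

def log_likelihood(theta, x, sigma=1.0):
    # sum of the log densities; numerically much better behaved than the product
    return np.sum(norm.logpdf(x, loc=theta, scale=sigma))

print(likelihood(2.0, x))      # a tiny number that underflows for large n
print(log_likelihood(2.0, x))  # the same information on a logarithmic scale

Working with the sum of logarithms instead of the product is the practical reason for the log-likelihood introduced below.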
 When applying maximum likelihood estimations we are interested in the
-parameters $\theta$ that maximize the likelihood (``mle''):
+parameter values
 \begin{equation}
   \theta_{mle} = \text{argmax}_{\theta} {\cal L}(\theta|x_1,x_2, \ldots x_n)
 \end{equation}
-$\text{argmax}_xf(x)$ denotes the values of the argument $x$ of the
-function $f(x)$ at which the function $f(x)$ reaches its global
-maximum. Thus, we search the value of $\theta$ at which the
-likelihood ${\cal L}(\theta)$ reaches its maximum.
+that maximize the likelihood. $\text{argmax}_xf(x)$ is the value of
+the argument $x$ of the function $f(x)$ for which the function $f(x)$
+assumes its global maximum. Thus, we search for the value of $\theta$
+at which the likelihood ${\cal L}(\theta)$ reaches its maximum.
 
 The position of a function's maximum does not change when the values
 of the function are transformed by a strictly monotonously rising
@@ -70,27 +69,27 @@ the likelihood (\enterm{log-likelihood}):
 \subsection{Example: the arithmetic mean}
 
 Suppose that the measurements $x_1, x_2, \ldots x_n$ originate from a
 normal distribution \eqnref{normpdfmean} and we consider the mean
-$\mu=\theta$ as the only parameter. Which value of $\theta$ maximizes
-its likelihood?
+$\mu$ as the only parameter $\theta$. Which value of $\theta$
+maximizes the likelihood of the data?
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{mlemean}
   \titlecaption{\label{mlemeanfig} Maximum likelihood estimation of
     the mean.}{Top: The measured data (blue dots) together with three
     different possible normal distributions with different means
-    (arrows) that could be the origin of the data. Bootom left: the
+    (arrows) the data could have originated from. Bottom left: the
     likelihood as a function of $\theta$ i.e. the mean. It is maximal
     at a value of $\theta = 2$. Bottom right: the
     log-likelihood. Taking the logarithm does not change the position
     of the maximum.}
 \end{figure}
+
 The log-likelihood \eqnref{loglikelihood}
 \begin{eqnarray*}
   \log {\cal L}(\theta|x_1,x_2, \ldots x_n) & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x_i-\theta)^2}{2\sigma^2}} \\
   & = & \sum_{i=1}^n - \log \sqrt{2\pi \sigma^2} -\frac{(x_i-\theta)^2}{2\sigma^2} \; .
 \end{eqnarray*}
-% FIXME do we need parentheses around the normal distribution in line one?
 Since the logarithm is the inverse function of the exponential
 ($\log(e^x)=x$), taking the logarithm removes the exponential from the
 normal distribution. To calculate the maximum of the log-likelihood,
@@ -102,12 +101,9 @@ zero:
   \Leftrightarrow \quad n \theta & = & \sum_{i=1}^n x_i \\
   \Leftrightarrow \quad \theta & = & \frac{1}{n} \sum_{i=1}^n x_i \;\; = \;\; \bar x
 \end{eqnarray*}
-From the above equations it becomes clear that the maximum likelihood
-estimation is equivalent to the mean of the data. That is, the
-assuming the mean of the data as $\theta$ maximizes the likelihood
-that the data originate from a normal distribution with that mean
-(\figref{mlemeanfig}).
-
+Thus, the maximum likelihood estimator is the arithmetic mean. That
+is, the arithmetic mean maximizes the likelihood that the data
+originate from a normal distribution (\figref{mlemeanfig}).
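As a numerical counterpart to this derivation, the following sketch (sample size, true mean, and the grid of candidate values are arbitrary choices; numpy is assumed) evaluates the log-likelihood on a grid of candidate means and confirms that it peaks at the arithmetic mean of the data:

import numpy as np

# Hypothetical sample: n=50 normally distributed values with true mean 3.
rng = np.random.RandomState(1)
sigma = 1.0
x = rng.randn(50)*sigma + 3.0

# Evaluate the log-likelihood on a grid of candidate means (theta).
thetas = np.linspace(0.0, 6.0, 601)
loglik = np.array([np.sum(-0.5*np.log(2.0*np.pi*sigma**2)
                          - (x - t)**2/(2.0*sigma**2)) for t in thetas])

# The position of the maximum coincides with the arithmetic mean.
print(thetas[np.argmax(loglik)], np.mean(x))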
 \begin{exercise}{mlemean.m}{mlemean.out}
   Draw $n=50$ random numbers from a normal distribution with a mean of
@@ -123,14 +119,16 @@ that the data originate from a normal distribution with that mean
 \pagebreak[4]
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Curve fitting as using maximum-likelihood estimation}
-
-During curve fitting a function of the form $f(x;\theta)$ with the
-parameter $\theta$ is adapted to the data pairs $(x_i|y_i)$ by
-adapting $\theta$. When we assume that the $y_i$ values are normally
-distributed around the function values $f(x_i;\theta)$ with a standard
-deviation $\sigma_i$, the log-likelihood is
-
+\section{Curve fitting}
+
+When fitting a function of the form $f(x;\theta)$ to data pairs
+$(x_i|y_i)$ one tries to adapt the parameter $\theta$ such that the
+function best describes the data. With maximum likelihood we search
+for the parameter value $\theta$ for which the likelihood that the data
+were drawn from the corresponding function is maximal. If we assume
+that the $y_i$ values are normally distributed around the function
+values $f(x_i;\theta)$ with a standard deviation $\sigma_i$, the
+log-likelihood is
 \begin{eqnarray*}
   \log {\cal L}(\theta|(x_1,y_1,\sigma_1), \ldots, (x_n,y_n,\sigma_n)) & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma_i^2}}e^{-\frac{(y_i-f(x_i;\theta))^2}{2\sigma_i^2}} \\
@@ -138,34 +136,32 @@ deviation $\sigma_i$, the log-likelihood is
 \end{eqnarray*}
 The only difference to the previous example is that the averages in
 the equations above are now given as the function values
-$f(x_i;\theta)$
-
-The parameter $\theta$ should be the one that maximizes the
-log-likelihood. The first part of the sum is independent of $\theta$
-and can thus be ignored during the search of the maximum:
+$f(x_i;\theta)$. The parameter $\theta$ should be the one that
+maximizes the log-likelihood. The first part of the sum is independent
+of $\theta$ and can thus be ignored when computing the maximum:
 \begin{eqnarray*}
   & = & - \frac{1}{2} \sum_{i=1}^n \left( \frac{y_i-f(x_i;\theta)}{\sigma_i} \right)^2
 \end{eqnarray*}
 We can further simplify by inverting the sign and then search for the
-minimum. Also the $1/2$ factor can be ignored since it does not affect
+minimum. Also the factor $1/2$ can be ignored since it does not affect
 the position of the minimum:
 \begin{equation}
   \label{chisqmin}
   \theta_{mle} = \text{argmin}_{\theta} \; \sum_{i=1}^n \left( \frac{y_i-f(x_i;\theta)}{\sigma_i} \right)^2 \;\; = \;\; \text{argmin}_{\theta} \; \chi^2
 \end{equation}
-The sum of the squared differences when normalized to the standard
+The sum of the squared differences normalized by the standard
 deviation is also called $\chi^2$. The parameter $\theta$ which
 minimizes the squared differences is thus the one that maximizes the
-probability that the data actually originate from the given
+likelihood that the data actually originate from the given
 function. Minimizing $\chi^2$ therefore is a maximum likelihood
 estimation.
 
 From the mathematical considerations above we can see that the
 minimization of the squared difference is a maximum-likelihood
 estimation only if the data are normally distributed around the
-function. In case of other distributions, the log-likelihood needs to
-be adapted accordingly \eqnref{loglikelihood} and be maximized
-respectively.
+function. In the case of other distributions, the log-likelihood
+\eqnref{loglikelihood} needs to be adapted accordingly and then
+maximized.
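The equivalence stated in \eqnref{chisqmin} can also be checked numerically. The following sketch (the exponential decay $f(x;\tau)=e^{-x/\tau}$, the noise level, and the true parameter value are arbitrary choices; numpy and scipy are assumed) minimizes $\chi^2$ and the negative Gaussian log-likelihood and arrives at the same parameter estimate:

import numpy as np
from scipy.optimize import minimize_scalar

# Hypothetical data: exponential decay f(x;tau)=exp(-x/tau) plus Gaussian noise.
rng = np.random.RandomState(2)
x = np.linspace(0.1, 5.0, 40)
sigma = 0.05                 # assumed measurement error of each data point
tau_true = 1.5               # assumed true parameter value
y = np.exp(-x/tau_true) + sigma*rng.randn(len(x))

def chisq(tau):
    # sum of squared differences normalized by the standard deviation
    return np.sum(((y - np.exp(-x/tau))/sigma)**2)

def neg_loglik(tau):
    # negative Gaussian log-likelihood of the data given the curve
    return -np.sum(-0.5*np.log(2.0*np.pi*sigma**2)
                   - (y - np.exp(-x/tau))**2/(2.0*sigma**2))

tau_chisq = minimize_scalar(chisq, bounds=(0.1, 10.0), method='bounded').x
tau_mle = minimize_scalar(neg_loglik, bounds=(0.1, 10.0), method='bounded').x
print(tau_chisq, tau_mle)    # both estimates agree up to numerical precision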
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{mlepropline}
@@ -175,9 +171,9 @@ respectively.
 
 \subsection{Example: simple proportionality}
 
-The function of a line going through the origin
-\[ f(x) = \theta x \]
-with the slope $\theta$. The $\chi^2$-sum is thus
+The function of a line with slope $\theta$ through the origin is
+\[ f(x) = \theta x \; . \]
+The $\chi^2$-sum is thus
 \[ \chi^2 = \sum_{i=1}^n \left( \frac{y_i-\theta x_i}{\sigma_i} \right)^2 \; . \]
 To estimate the minimum we again take the first derivative with
 respect to $\theta$ and equate it to zero:
@@ -189,14 +185,14 @@ respect to $\theta$ and equate it to zero:
   \Leftrightarrow \quad \theta \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2} & = & \sum_{i=1}^n \frac{x_iy_i}{\sigma_i^2} \nonumber \\
   \Leftrightarrow \quad \theta & = & \frac{\sum_{i=1}^n \frac{x_iy_i}{\sigma_i^2}}{ \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2}} \label{mleslope}
 \end{eqnarray}
-With this we obtained an analytical expression for the estimation of
-the slope $\theta$ of the regression line (\figref{mleproplinefig}).
-
-A gradient descent, as we have done in the previous chapter, is thus
-not necessary for the fitting of the slope of a linear equation. This
-holds even more generally for fitting the coefficients of linearly
-combined basis functions as for example the fitting of the slope $m$
-and the y-intercept $b$ of the linear equation
+This is an analytical expression for the estimation of the slope
+$\theta$ of the regression line (\figref{mleproplinefig}).
+
+A gradient descent, as used in the previous chapter, is not
+necessary for fitting the slope of a straight line. More generally,
+this also holds for fitting the coefficients of linearly
+combined basis functions, as for example the slope $m$ and the
+y-intercept $b$ of a straight line
 \[ y = m \cdot x +b \]
 or, more generally, the coefficients $a_k$ of a polynom
 \[ y = \sum_{k=0}^N a_k x^k = a_o + a_1x + a_2x^2 + a_3x^4 + \ldots \]
@@ -219,14 +215,14 @@ of a probability density function (e.g. the shape parameter of a
 A first guess could be to fit the probability density by minimization
 of the squared difference to a histogram of the measured data. For
 several reasons this is, however, not the method of choice: (i)
-probability densities can only be positive which leads, for small
-values in particular, to asymmetric distributions. (ii) the values of
-a histogram are not independent because the integral of a density is
-unity. The two basic assumptions of normally distributed and
-independent samples, which are a prerequisite make the minimization of
-the squared difference \eqnref{chisqmin} to a maximum likelihood
-estimation, are violated. (iii) The histogram strongly depends on the
-chosen bin size \figref{mlepdffig}).
+Probability densities can only be positive, which leads, for small
+values in particular, to asymmetric distributions. (ii) The values of
+a histogram estimating the density are not independent because the
+integral over a density is unity. The two basic assumptions of
+normally distributed and independent samples, which are a prerequisite
+for the minimization of the squared difference \eqnref{chisqmin} to be
+a maximum likelihood estimation, are violated. (iii) The histogram
+strongly depends on the chosen bin size (\figref{mlepdffig}).
 
 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{mlepdf}
   \titlecaption{\label{mlepdffig} Maximum likelihood estimation of
 \end{figure}
 
-Using the example of the estimating the mean value of a normal
+Using the example of estimating the mean value of a normal
 distribution we have discussed the direct approach to fit a
 probability density to data via maximum likelihood. We simply search
 for the parameter $\theta$ of the desired probability density function
-that maximizes the log-likelihood. This is a non-linear optimization
-problem that is generally solved with numerical methods such as the
-gradient descent \matlabfun{mle()}.
+that maximizes the log-likelihood. In general this is a non-linear
+optimization problem that is solved with numerical methods
+such as gradient descent \matlabfun{mle()}.
 
 \begin{exercise}{mlegammafit.m}{mlegammafit.out}
-  Create a sample of gamma-distributed random number and apply the
+  Generate a sample of gamma-distributed random numbers and apply the
   maximum likelihood method to estimate the parameters of the gamma
   function from the data.
   \pagebreak
@@ -259,16 +255,16 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Neural coding}
 
-In sensory systems certain aspects of the surrounding are encoded in
+In sensory systems certain aspects of the environment are encoded in
 the neuronal activity of populations of neurons. One example of such
-population coding is the tuning of neurons in the primary visual
+a population code is the tuning of neurons in the primary visual
 cortex (V1) to the orientation of a visual stimulus. Different neurons
 respond best to different stimulus orientations. Traditionally, such a
 tuning is measured by analyzing the neuronal response strength
 (e.g. the firing rate) as a function of the orientation of the visual
 stimulus and is depicted and summarized with the so called
-\enterm{tuning-curve}(German \determ{Abstimmkurve},
-figure~\ref{mlecoding}, top).
+\enterm{tuning-curve} (\determ{Abstimmkurve},
+figure~\ref{mlecodingfig}, top).
 
 \begin{figure}[tp]
   \includegraphics[width=1\textwidth]{mlecoding}
@@ -278,7 +274,7 @@ figure~\ref{mlecoding}, top).
   dark bar in front of a white background). The stimulus that evokes
   the strongest activity in that neuron is the bar with the vertical
   orientation (arrow, $\phi_i=90$\,\degree). The red area indicates
-  the variability of the neuronal activity $p(r)$ around the tunig
+  the variability of the neuronal activity $p(r|\phi)$ around the tuning
   curve. Center: In a population of neurons, each neuron may have a
   different tuning curve (colors). A specific stimulus (the vertical
   bar) activates the individual neurons of the population in a
@@ -287,10 +283,10 @@ figure~\ref{mlecoding}, top).
 \end{figure}
 
 The brain, however, is confronted with the inverse problem: given a
-certain activity pattern in the neuronal population, what was the
+certain activity pattern in the neuronal population, what is the
 stimulus? In the sense of maximum likelihood, a possible answer to
-this question would be: It was the stimulus for which the particular
-activity pattern is most likely.
+this question would be: the stimulus for which the particular
+activity pattern is most likely, given the tuning of the neurons.
 
 Let's stay with the example of the orientation tuning in V1.
 The tuning $\Omega_i(\phi)$ of the neurons $i$ to the preferred stimulus
@@ -298,15 +294,12 @@ orientation $\phi_i$ can be well described using a van-Mises function
 (the Gaussian function on a cyclic x-axis) (\figref{mlecodingfig}):
 \[ \Omega_i(\phi) = c \cdot e^{\cos(2(\phi-\phi_i))} \quad , \quad c \in \reZ \]
-The neuronal activity is approximated with a normal
-distribution around the tuning curve with a standard deviation
-$\sigma=\Omega/4$ which is proprotional to $\Omega$ such that the
-probability $p_i(r|\phi)$ of the $i$-th neuron showing the activity
-$r$ given a certain orientation $\phi$ is given by
-
+If we approximate the neuronal activity by a normal distribution
+around the tuning curve with a standard deviation $\sigma=\Omega/4$,
+which is proportional to $\Omega$, then the probability
+$p_i(r|\phi)$ of the $i$-th neuron showing the activity $r$ given a
+certain orientation $\phi$ is given by
 \[ p_i(r|\phi) = \frac{1}{\sqrt{2\pi}\Omega_i(\phi)/4} e^{-\frac{1}{2}\left(\frac{r-\Omega_i(\phi)}{\Omega_i(\phi)/4}\right)^2} \; . \]
 The log-likelihood of the stimulus orientation $\phi$ given the
 activity pattern in the population $r_1$, $r_2$, ... $r_n$ is thus
 \[ {\cal L}(\phi|r_1, r_2, \ldots r_n) = \sum_{i=1}^n \log p_i(r_i|\phi) \]
-
-\selectlanguage{english}
diff --git a/likelihood/lecture/mlepdf.py b/likelihood/lecture/mlepdf.py
index 22f89bf..54ffd4c 100644
--- a/likelihood/lecture/mlepdf.py
+++ b/likelihood/lecture/mlepdf.py
@@ -36,7 +36,10 @@ ax.set_xlabel('x')
 ax.set_ylabel('Probability density')
 ax.plot(xx, yy, '-', lw=5, color='#ff0000', label='pdf')
 ax.plot(xx, yf, '-', lw=2, color='#ffcc00', label='mle')
-ax.scatter(xd, 0.025*rng.rand(len(xd))+0.05, s=30, zorder=10)
+kernel = st.gaussian_kde(xd)
+x = kernel(xd)
+x /= np.max(x)
+ax.scatter(xd, 0.05*x*(rng.rand(len(xd))-0.5)+0.05, s=30, zorder=10)
 ax.legend(loc='upper right', frameon=False)
 
 # histogram:
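As a numerical illustration of the maximum-likelihood decoding described in the neural-coding section above, the following sketch evaluates the population log-likelihood on a grid of orientations and reads off the most likely stimulus. The constant $c$, the set of preferred orientations, the noise model $\sigma=\Omega/4$, and the true orientation of 70 degrees are arbitrary choices for the illustration; numpy is assumed.

import numpy as np

# Van-Mises tuning curves and Gaussian response variability (sigma = Omega/4).
rng = np.random.RandomState(3)
c = 2.0
phi_pref = np.deg2rad(np.arange(0.0, 180.0, 20.0))   # preferred orientations

def tuning(phi, phi_pref):
    return c*np.exp(np.cos(2.0*(phi - phi_pref)))

# Simulate a noisy population response to a stimulus of 70 degrees.
phi_true = np.deg2rad(70.0)
omega = tuning(phi_true, phi_pref)
r = omega + (omega/4.0)*rng.randn(len(phi_pref))

# Evaluate the population log-likelihood on a grid of orientations.
phis = np.deg2rad(np.linspace(0.0, 180.0, 361))
loglik = np.zeros(len(phis))
for k, phi in enumerate(phis):
    om = tuning(phi, phi_pref)
    sig = om/4.0
    loglik[k] = np.sum(-0.5*np.log(2.0*np.pi*sig**2) - (r - om)**2/(2.0*sig**2))

# The decoded orientation is the position of the maximum.
print(np.rad2deg(phis[np.argmax(loglik)]))   # close to 70 degrees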