From de7c3dfd103e751d32a54bc788683b4465b07b65 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Mon, 16 Dec 2019 10:04:40 +0100
Subject: [PATCH] [likelihood] equation get numbers

---
 likelihood/lecture/likelihood.tex | 71 ++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 29 deletions(-)

diff --git a/likelihood/lecture/likelihood.tex b/likelihood/lecture/likelihood.tex
index dcaafaa..28cc395 100644
--- a/likelihood/lecture/likelihood.tex
+++ b/likelihood/lecture/likelihood.tex
@@ -27,10 +27,10 @@ defined by the mean $\mu$ and the standard deviation $\sigma$ as
 parameters $\theta$. If the $n$ independent observations of $x_1,
 x_2, \ldots x_n$ originate from the same probability density
 distribution (they are \enterm[i.i.d.|see{independent and identically
-distributed}]{i.i.d.}, \enterm{independent and identically
-distributed}) then the conditional probability $p(x_1,x_2, \ldots
-x_n|\theta)$ of observing $x_1, x_2, \ldots x_n$ given a specific
-$\theta$ is given by
+  distributed}]{i.i.d.}, \enterm{independent and identically
+  distributed}) then the conditional probability $p(x_1,x_2, \ldots
+x_n|\theta)$ of observing $x_1, x_2, \ldots x_n$ given some specific
+parameter values $\theta$ is given by
 \begin{equation}
   p(x_1,x_2, \ldots x_n|\theta) = p(x_1|\theta) \cdot p(x_2|\theta)
   \ldots p(x_n|\theta) = \prod_{i=1}^n p(x_i|\theta) \; .
@@ -50,15 +50,18 @@ parameter values
   \theta_{mle} = \text{argmax}_{\theta} {\cal L}(\theta|x_1,x_2, \ldots x_n)
 \end{equation}
 that maximize the likelihood. $\text{argmax}_xf(x)$ is the value of
-the argument $x$ of the function $f(x)$ for which the function $f(x)$
-assumes its global maximum. Thus, we search for the value of $\theta$
-at which the likelihood ${\cal L}(\theta)$ reaches its maximum.
+the argument $x$ for which the function $f(x)$ assumes its global
+maximum. Thus, we search for the parameter values $\theta$ at which
+the likelihood ${\cal L}(\theta)$ reaches its maximum. For these
+parameter values the measured data most likely originated from the
+corresponding distribution.

 The position of a function's maximum does not change when the values
 of the function are transformed by a strictly monotonically increasing
-function such as the logarithm. For numerical and reasons that we will
-discuss below, we commonly search for the maximum of the logarithm of
-the likelihood (\entermde[likelihood!log-]{Likelihood!Log-}{log-likelihood}):
+function such as the logarithm. For numerical reasons, and for reasons
+that we will discuss below, we search for the maximum of the logarithm
+of the likelihood
+(\entermde[likelihood!log-]{Likelihood!Log-}{log-likelihood}):
 \begin{eqnarray}
   \theta_{mle} & = & \text{argmax}_{\theta}\; {\cal L}(\theta|x_1,x_2, \ldots x_n) \nonumber \\
@@ -87,22 +90,22 @@ maximizes the likelihood of the data?
 \end{figure}

 The log-likelihood \eqnref{loglikelihood}
-\begin{eqnarray*}
+\begin{eqnarray}
   \log {\cal L}(\theta|x_1,x_2, \ldots x_n)
-  & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x_i-\theta)^2}{2\sigma^2}} \\
+  & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma^2}}e^{-\frac{(x_i-\theta)^2}{2\sigma^2}} \nonumber \\
   & = & \sum_{i=1}^n - \log \sqrt{2\pi \sigma^2} -\frac{(x_i-\theta)^2}{2\sigma^2} \; .
-\end{eqnarray*}
+\end{eqnarray}
 Since the logarithm is the inverse function of the exponential
 ($\log(e^x)=x$), taking the logarithm removes the exponential from the
 normal distribution.
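The maximum can also be located numerically. The following sketch (Python with numpy and scipy assumed to be available; it is not part of the likelihood.tex sources, and all names in it are illustrative) evaluates the log-likelihood of normally distributed data on a grid of candidate means and picks the grid value where it is largest; the result lies close to the arithmetic mean of the data, which the derivation below confirms is the exact maximum-likelihood estimate.

import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(1)
x = rng.normal(2.0, 1.0, size=100)      # n = 100 observations, true mean 2.0, sigma 1.0

mus = np.linspace(0.0, 4.0, 401)        # candidate values for the parameter theta (the mean)
# log-likelihood: sum of the log of the normal density of each observation
loglik = np.array([np.sum(norm.logpdf(x, loc=mu, scale=1.0)) for mu in mus])

theta_mle = mus[np.argmax(loglik)]      # argmax over the grid
print(theta_mle, x.mean())              # both are close to the true mean

A numerical optimizer could replace the grid, but the grid keeps the argmax explicit.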
 To calculate the maximum of the log-likelihood, we need to take the
 derivative with respect to $\theta$ and set it to zero:
-\begin{eqnarray*}
-  \frac{\text{d}}{\text{d}\theta} \log {\cal L}(\theta|x_1,x_2, \ldots x_n) & = & \sum_{i=1}^n - \frac{2(x_i-\theta)}{2\sigma^2} \;\; = \;\; 0 \\
-  \Leftrightarrow \quad \sum_{i=1}^n x_i - \sum_{i=1}^n \theta & = & 0 \\
-  \Leftrightarrow \quad n \theta & = & \sum_{i=1}^n x_i \\
+\begin{eqnarray}
+  \frac{\text{d}}{\text{d}\theta} \log {\cal L}(\theta|x_1,x_2, \ldots x_n) & = & \sum_{i=1}^n - \frac{2(x_i-\theta)}{2\sigma^2} \;\; = \;\; 0 \nonumber \\
+  \Leftrightarrow \quad \sum_{i=1}^n x_i - \sum_{i=1}^n \theta & = & 0 \nonumber \\
+  \Leftrightarrow \quad n \theta & = & \sum_{i=1}^n x_i \nonumber \\
   \Leftrightarrow \quad \theta & = & \frac{1}{n} \sum_{i=1}^n x_i \;\; = \;\; \bar x
-\end{eqnarray*}
+\end{eqnarray}
 Thus, the maximum likelihood estimator is the arithmetic mean. That
 is, the arithmetic mean maximizes the likelihood that the data
 originate from a normal distribution centered at the arithmetic mean
@@ -194,19 +197,19 @@ were drawn from the corresponding function is maximal.
 If we assume that the $y_i$ values are normally distributed around
 the function values $f(x_i;\theta)$ with a standard deviation
 $\sigma_i$, the log-likelihood is
-\begin{eqnarray*}
+\begin{eqnarray}
   \log {\cal L}(\theta|(x_1,y_1,\sigma_1), \ldots, (x_n,y_n,\sigma_n))
-  & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma_i^2}}e^{-\frac{(y_i-f(x_i;\theta))^2}{2\sigma_i^2}} \\
+  & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma_i^2}}e^{-\frac{(y_i-f(x_i;\theta))^2}{2\sigma_i^2}} \nonumber \\
   & = & \sum_{i=1}^n - \log \sqrt{2\pi \sigma_i^2} -\frac{(y_i-f(x_i;\theta))^2}{2\sigma_i^2} \\
-\end{eqnarray*}
+\end{eqnarray}
 The only difference from the previous example is that the averages in
 the equations above are now given as the function values
 $f(x_i;\theta)$. The parameter $\theta$ should be the one that
 maximizes the log-likelihood. The first part of the sum is independent
 of $\theta$ and can thus be ignored when computing the maximum:
-\begin{eqnarray*}
+\begin{eqnarray}
   & = & - \frac{1}{2} \sum_{i=1}^n \left( \frac{y_i-f(x_i;\theta)}{\sigma_i} \right)^2
-\end{eqnarray*}
+\end{eqnarray}
 We can further simplify by inverting the sign and then search for the
 minimum. Also the factor $1/2$ can be ignored since it does not
 affect the position of the minimum:
@@ -295,11 +298,15 @@ numbers are also called \entermde[z-values]{z-Wert}{$z$-values} or
 $z$-scores and they have the property $\bar x = 0$ and $\sigma_x = 1$.
 $z$-scores are often used in biology to make quantities that differ
 in their units comparable. For standardized data the variance
-\[ \sigma_x^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)^2 = \frac{1}{n} \sum_{i=1}^n x_i^2 = 1 \]
+\begin{equation}
+  \sigma_x^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)^2 = \frac{1}{n} \sum_{i=1}^n x_i^2 = 1
+\end{equation}
 is given by the mean squared data and equals one. The covariance
 between $x$ and $y$ also simplifies to
-\[ \text{cov}(x, y) = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)(y_i -
-\bar y) =\frac{1}{n} \sum_{i=1}^n x_i y_i \]
+\begin{equation}
+  \text{cov}(x, y) = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)(y_i -
+  \bar y) =\frac{1}{n} \sum_{i=1}^n x_i y_i
+\end{equation}
 the averaged product between pairs of $x$ and $y$ values.
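As a quick numerical illustration of this last point (a sketch assuming Python with numpy; the variable names are made up for this example), standardizing two variables and averaging the products of their z-scores reproduces the covariance formula above and also yields the correlation coefficient that the next paragraph refers to.

import numpy as np

rng = np.random.default_rng(2)
x = rng.normal(10.0, 3.0, size=1000)
y = 0.5 * x + rng.normal(0.0, 2.0, size=1000)

# standardize: subtract the mean and divide by the standard deviation
zx = (x - x.mean()) / x.std()
zy = (y - y.mean()) / y.std()

print(zx.mean(), zx.var())       # approximately 0 and exactly 1
print(np.mean(zx * zy))          # covariance of z-scores = averaged product of pairs
print(np.corrcoef(x, y)[0, 1])   # ... which equals the correlation coefficient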
 Recall that the correlation coefficient $r_{x,y}$,
 \eqnref{correlationcoefficient}, is the covariance normalized by the
@@ -356,16 +363,22 @@ Let's stay with the example of the orientation tuning in V1. The tuning
 $\Omega_i(\phi)$ of the neurons $i$ to the preferred edge orientation
 $\phi_i$ can be well described using a von Mises function (the
 Gaussian function on a cyclic x-axis) (\figref{mlecodingfig}):
-\[ \Omega_i(\phi) = c \cdot e^{\cos(2(\phi-\phi_i))} \quad , \quad c \in \reZ \]
+\begin{equation}
+  \Omega_i(\phi) = c \cdot e^{\cos(2(\phi-\phi_i))} \quad , \quad c \in \reZ
+\end{equation}
 If we approximate the neuronal activity by a normal distribution
 around the tuning curve with a standard deviation $\sigma=\Omega/4$,
 which is proportional to $\Omega$, then the probability $p_i(r|\phi)$
 of the $i$-th neuron showing the activity $r$ given a certain
 orientation $\phi$ of an edge is given by
-\[ p_i(r|\phi) = \frac{1}{\sqrt{2\pi}\Omega_i(\phi)/4} e^{-\frac{1}{2}\left(\frac{r-\Omega_i(\phi)}{\Omega_i(\phi)/4}\right)^2} \; . \]
+\begin{equation}
+  p_i(r|\phi) = \frac{1}{\sqrt{2\pi}\Omega_i(\phi)/4} e^{-\frac{1}{2}\left(\frac{r-\Omega_i(\phi)}{\Omega_i(\phi)/4}\right)^2} \; .
+\end{equation}
 The log-likelihood of the edge orientation $\phi$ given the activity
 pattern in the population $r_1$, $r_2$, ... $r_n$ is thus
-\[ {\cal L}(\phi|r_1, r_2, \ldots r_n) = \sum_{i=1}^n \log p_i(r_i|\phi) \]
+\begin{equation}
+  {\cal L}(\phi|r_1, r_2, \ldots r_n) = \sum_{i=1}^n \log p_i(r_i|\phi)
+\end{equation}
 The angle $\phi$ that maximizes this likelihood is then an estimate
 of the orientation of the edge.
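A small simulation can make this decoding scheme concrete. The sketch below (Python with numpy assumed; the number of neurons, the constant $c=1$, and all variable names are illustrative choices, while the tuning curve and the $\sigma=\Omega/4$ noise model follow the text) draws noisy responses of an orientation-tuned population to one edge and recovers the orientation as the argmax of the summed log-probabilities.

import numpy as np

rng = np.random.default_rng(3)
c = 1.0                                                  # scaling constant of the tuning curves
phi_pref = np.linspace(0.0, np.pi, 12, endpoint=False)   # preferred orientations of 12 neurons

def tuning(phi, phi_pref):
    """von Mises tuning curve Omega_i(phi) = c * exp(cos(2 (phi - phi_i)))."""
    return c * np.exp(np.cos(2.0 * (phi - phi_pref)))

# simulate the population response to one true edge orientation:
phi_true = 0.7
omega = tuning(phi_true, phi_pref)
r = rng.normal(omega, omega / 4.0)                       # Gaussian noise with sigma = Omega/4

def loglik(phi):
    """Summed log-probability of the observed activities given orientation phi."""
    om = tuning(phi, phi_pref)
    sd = om / 4.0
    return np.sum(-0.5 * ((r - om) / sd) ** 2 - np.log(np.sqrt(2.0 * np.pi) * sd))

phis = np.linspace(0.0, np.pi, 361)                      # orientations are periodic with pi
ll = np.array([loglik(phi) for phi in phis])
phi_mle = phis[np.argmax(ll)]                            # maximum-likelihood estimate
print(phi_true, phi_mle)

Because the tuning curves are $\pi$-periodic, the grid only needs to cover orientations between $0$ and $\pi$.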