diff --git a/likelihood/lecture/likelihood-chapter.tex b/likelihood/lecture/likelihood-chapter.tex index 736d9eb..40021f4 100644 --- a/likelihood/lecture/likelihood-chapter.tex +++ b/likelihood/lecture/likelihood-chapter.tex @@ -20,12 +20,11 @@ \section{TODO} \begin{itemize} -\item Fitting psychometric functions: +\item Fitting psychometric functions, logistic regression: Variable $x_i$, responses $r_i$ either 0 or 1. - $p(x_i, \theta)$ is Weibull or Boltzmann function. + $p(x_i, \theta)$ is Weibull or Boltzmann or logistic function. Likelihood is $L = \prod p(x_i, \theta)^{r_i} (1-p(x_i, \theta))^{1-r_i}$. - Use fminsearch for fitting. -\item GLM model fitting? + Use fminsearch for fitting? \end{itemize} \end{document} diff --git a/likelihood/lecture/likelihood.tex b/likelihood/lecture/likelihood.tex index cae9ff0..ea0cfd4 100644 --- a/likelihood/lecture/likelihood.tex +++ b/likelihood/lecture/likelihood.tex @@ -7,15 +7,15 @@ The core task of statistics is to infer from measured data some parameters describing the data. These parameters can be simply a mean, a standard deviation, or any other parameter needed to describe the -distribution the data a re originating from, a correlation -coefficient, or some parameters of a function describing a particular -dependence between the data. The brain faces exactly the same -problem. Given the activity pattern of some neurons (the data) it -needs to infer some aspects (parameters) of the environment and the -internal state of the body in order to generate some useful -behavior. One possible approach to estimate parameters from data are -\enterm[maximum likelihood estimator]{maximum likelihood estimators} -(\enterm[mle|see{maximum likelihood estimator}]{mle}, +distribution the data originate from, a correlation coefficient, +or some parameters of a function describing a particular dependence +between the data. The brain faces exactly the same problem. Given the +activity pattern of some neurons (the data) it needs to infer some +aspects (parameters) of the environment and the internal state of the +body in order to generate some useful behavior. One possible approach +to estimating parameters from data is to use \enterm[maximum likelihood +estimator]{maximum likelihood estimators} (\enterm[mle|see{maximum + likelihood estimator}]{mle}, \determ{Maximum-Likelihood-Sch\"atzer}). They choose the parameters such that they maximize the likelihood of the specific data values to originate from a specific distribution. @@ -124,7 +124,7 @@ and set it to zero: \Leftrightarrow \quad \sum_{i=1}^n - \frac{2(x_i-\mu)}{2\sigma^2} & = & 0 \nonumber \\ \Leftrightarrow \quad \sum_{i=1}^n x_i - \sum_{i=1}^n \mu & = & 0 \nonumber \\ \Leftrightarrow \quad n \mu & = & \sum_{i=1}^n x_i \nonumber \\ - \Leftrightarrow \quad \mu & = & \frac{1}{n} \sum_{i=1}^n x_i \;\; = \;\; \bar x + \Leftrightarrow \quad \mu & = & \frac{1}{n} \sum_{i=1}^n x_i \;\; = \;\; \bar x \label{arithmeticmean} \end{eqnarray} Thus, the maximum likelihood estimator of the population mean of normally distributed data is the arithmetic mean. That is, the @@ -173,28 +173,28 @@ For non-Gaussian distributions, for example a \entermde[distribution!Gamma-]{Verteilung!Gamma-}{Gamma-distribution} \begin{equation} \label{gammapdf} - p(x|\alpha,\beta) \sim x^{\alpha-1}e^{-\beta x} \; , + p(x|\alpha,\beta) \sim x^{\alpha-1}e^{-\beta x} \end{equation} -however, such simple analytical expressions for the parameters of the -distribution do not exist.
This is the case, for example, for the -shape parameter $\alpha$ of the Gamma-distribution. How do we fit such -a distribution to some data? That is, how should we compute the -values of the parameters of the distribution, given the data? - -A first guess could be to fit the probability density function by -minimization of the squared difference to a histogram of the measured -data in the same way as we fit a a function to some data. For several -reasons this is, however, not the method of choice: (i) Probability -densities can only be positive which leads, for small values in -particular, to asymmetric distributions of the estimated histogram -around the true density. (ii) The values of a histogram estimating the -density are not independent because the integral over a density is -unity. The two basic assumptions of normally distributed and -independent samples, which are a prerequisite for making the -minimization of the squared difference to a maximum likelihood -estimation (see next section), are violated. (iii) The estimation of -the probability density by means of a histogram strongly depends on -the chosen bin size \figref{mlepdffig}). +with a shape parameter $\alpha$ and a rate parameter $\beta$ +(\figrefb{mlepdffig}), in general such simple analytical +expressions for estimating the parameters directly from the data do +not exist. How do we fit such a distribution to the data? That is, +how should we compute the values of the parameters of the +distribution, given the data? + +A first guess could be to fit the probability density function by a +\enterm{least squares} fit to a normalized histogram of the measured data in +the same way as we fit a function to some data. For several reasons +this is, however, not the method of choice: (i) Probability densities +can only be positive, which leads, in particular for small values, to +asymmetric distributions of the estimated histogram around the true +density. (ii) The values of a histogram estimating the density are not +independent because the integral over a density is unity. The two +basic assumptions of normally distributed and independent samples, +which are a prerequisite for making the minimization of the squared +difference to a maximum likelihood estimation (see next section), are +violated. (iii) The estimation of the probability density by means of +a histogram strongly depends on the chosen bin size. \begin{figure}[t] \includegraphics[width=1\textwidth]{mlepdf} @@ -203,9 +203,9 @@ the chosen bin size \figref{mlepdffig}). order Gamma-distribution. The maximum likelihood estimation of the probability density function is shown in orange, the true pdf is shown in red. Right: normalized histogram of the data together - with the real (red) and the fitted probability density - functions. The fit was done by minimizing the squared difference - to the histogram.} + with the true probability density (red) and the probability + density function obtained by a least squares fit to the + histogram.} \end{figure} Instead we should stay with maximum-likelihood estimation. Exactly in @@ -228,13 +228,13 @@ numerical methods such as the gradient descent \matlabfun{mle()}. \section{Curve fitting} When fitting a function of the form $f(x;\theta)$ to data pairs -$(x_i|y_i)$ one tries to adapt the parameter $\theta$ such that the +$(x_i|y_i)$ one tries to adapt the parameters $\theta$ such that the function best describes the data.
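(Editorial sketch, not part of the original text: the Gamma-distribution of the previous section can be fitted by numerically maximizing its log-likelihood. The sketch assumes NumPy/SciPy, in the spirit of the chapter's Python figure scripts, rather than the MATLAB mle() mentioned above; data and starting values are made up.)

import numpy as np
from scipy.optimize import minimize
from scipy.stats import gamma

rng = np.random.RandomState(1)
data = rng.gamma(2.0, 1.0, 100)      # 100 samples from a Gamma-distribution (shape 2)

def neg_loglikelihood(p, x):
    alpha, beta = p                  # shape and rate parameter
    if alpha <= 0.0 or beta <= 0.0:
        return np.inf                # keep the optimizer in the valid range
    # scipy's gamma uses a scale parameter, which is the inverse of the rate:
    return -np.sum(gamma.logpdf(x, alpha, scale=1.0/beta))

result = minimize(neg_loglikelihood, [1.0, 1.0], args=(data,),
                  method='Nelder-Mead')
alpha_mle, beta_mle = result.x
print(alpha_mle, beta_mle)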
In chapter~\ref{gradientdescentchapter} we simply assumed that ``best'' means minimizing the squared distance between the data and the -function. With maximum likelihood we search for the parameter value -$\theta$ for which the likelihood that the data were drawn from the -corresponding function is maximal. +function. With maximum likelihood we search for those parameter +values $\theta$ that maximize the likelihood of the data to be drawn +from the corresponding function. If we assume that the $y_i$ values are normally distributed around the function values $f(x_i;\theta)$ with a standard deviation $\sigma_i$, @@ -242,35 +242,49 @@ the log-likelihood is \begin{eqnarray} \log {\cal L}(\theta|(x_1,y_1,\sigma_1), \ldots, (x_n,y_n,\sigma_n)) & = & \sum_{i=1}^n \log \frac{1}{\sqrt{2\pi \sigma_i^2}}e^{-\frac{(y_i-f(x_i;\theta))^2}{2\sigma_i^2}} \nonumber \\ - & = & \sum_{i=1}^n - \log \sqrt{2\pi \sigma_i^2} -\frac{(y_i-f(x_i;\theta))^2}{2\sigma_i^2} \\ + & = & \sum_{i=1}^n - \log \sqrt{2\pi \sigma_i^2} -\frac{(y_i-f(x_i;\theta))^2}{2\sigma_i^2} \end{eqnarray} -The only difference to the previous example is that the averages in -the equations above are now given as the function values -$f(x_i;\theta)$. The parameter $\theta$ should be the one that -maximizes the log-likelihood. The first part of the sum is independent -of $\theta$ and can thus be ignored when computing the the maximum: +The only difference to the previous example of the arithmetic mean is +that the means $\mu$ in the equations above are given by the function +values $f(x_i;\theta)$. The parameters $\theta$ should maximize the +log-likelihood. The first term in the sum is independent of $\theta$ +and can be ignored when computing the maximum. From the second +term we pull out the constant factor $-\frac{1}{2}$: \begin{eqnarray} & = & - \frac{1}{2} \sum_{i=1}^n \left( \frac{y_i-f(x_i;\theta)}{\sigma_i} \right)^2 \end{eqnarray} We can further simplify by inverting the sign and then search for the -minimum. Also the factor $1/2$ can be ignored since it does not affect -the position of the minimum: +minimum. Also the factor $\frac{1}{2}$ can be ignored since it does +not affect the position of the minimum: \begin{equation} \label{chisqmin} \theta_{mle} = \text{argmin}_{\theta} \; \sum_{i=1}^n \left( \frac{y_i-f(x_i;\theta)}{\sigma_i} \right)^2 \;\; = \;\; \text{argmin}_{\theta} \; \chi^2 \end{equation} -The sum of the squared differences normalized by the standard -deviation is also called $\chi^2$ (chi squared). The parameter +The sum of the squared differences between the $y$-data values and the +function values, normalized by the standard deviation of the data +around the function, is called $\chi^2$ (chi squared). The parameter $\theta$ which minimizes the squared differences is thus the one that maximizes the likelihood of the data to actually originate from the -given function. Therefore, minimizing $\chi^2$ is a maximum likelihood -estimation. +given function. + +It does not matter whether we minimize $\chi^2$ or the +\enterm{mean squared error} \eqref{meansquarederror} introduced in +chapter~\ref{gradientdescentchapter}. The latter is +the mean and $\chi^2$ is the sum of the squared differences. They +simply differ by the constant factor $n$, the number of data pairs, +which does not affect the position of the minimum. $\chi^2$ is more +general in that it allows for different standard deviations for each +data pair.
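To make \eqref{chisqmin} concrete, a minimal numerical sketch (an editorial addition; the quadratic example function and the data are invented for illustration, and SciPy's generic minimizer stands in for the gradient descent and the MATLAB functions mentioned in the text):

import numpy as np
from scipy.optimize import minimize

def f(x, theta):
    return theta * x**2                     # hypothetical example function

def chisq(theta, x, y, sigma):
    # sum of squared differences, each normalized by its own sigma_i
    return np.sum(((y - f(x, theta)) / sigma)**2)

rng = np.random.RandomState(2)
x = np.linspace(0.0, 2.0, 40)
sigma = 0.2 + 0.1*x                         # per-point standard deviations
y = f(x, 1.5) + sigma*rng.randn(len(x))     # data scattered around f(x; 1.5)

theta_mle = minimize(chisq, 1.0, args=(x, y, sigma)).x[0]
print(theta_mle)                            # close to 1.5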
If the standard deviations are all the same ($\sigma_i = \sigma$), the common +standard deviation can be pulled out of the sum and also does not +influence the position of the minimum. Both \enterm{least squares} and +minimizing $\chi^2$ are maximum likelihood estimators of the +parameters $\theta$ of a function. From the mathematical considerations above we can see that the minimization of the squared difference is a maximum-likelihood estimation only if the data are normally distributed around the function. In case of other distributions, the log-likelihood -\eqnref{loglikelihood} needs to be adapted accordingly. +\eqref{loglikelihood} needs to be adapted accordingly. \begin{figure}[t] \includegraphics[width=1\textwidth]{mlepropline} @@ -283,26 +297,32 @@ function. In case of other distributions, the log-likelihood the normal distribution of the data around the line (right histogram).} \end{figure} +Let's go on and calculate the minimum \eqref{chisqmin} of $\chi^2$ +analytically for a simple function. -\subsection{Example: simple proportionality} -The function of a line with slope $\theta$ through the origin is -\[ f(x) = \theta x \; . \] -The $\chi^2$-sum is thus -\[ \chi^2 = \sum_{i=1}^n \left( \frac{y_i-\theta x_i}{\sigma_i} \right)^2 \; . \] -To estimate the minimum we again take the first derivative with -respect to $\theta$ and equate it to zero: -\begin{eqnarray} - \frac{\text{d}}{\text{d}\theta}\chi^2 & = & \frac{\text{d}}{\text{d}\theta} \sum_{i=1}^n \left( \frac{y_i-\theta x_i}{\sigma_i} \right)^2 \nonumber \\ - & = & \sum_{i=1}^n \frac{\text{d}}{\text{d}\theta} \left( \frac{y_i-\theta x_i}{\sigma_i} \right)^2 \nonumber \\ - & = & -2 \sum_{i=1}^n \frac{x_i}{\sigma_i} \left( \frac{y_i-\theta x_i}{\sigma_i} \right) \nonumber \\ - & = & -2 \sum_{i=1}^n \left( \frac{x_i y_i}{\sigma_i^2} - \theta \frac{x_i^2}{\sigma_i^2} \right) \;\; = \;\; 0 \nonumber \\ -\Leftrightarrow \quad \theta \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2} & = & \sum_{i=1}^n \frac{x_i y_i}{\sigma_i^2} \nonumber -\end{eqnarray} + +\subsection{Straight line through the origin} +The function of a straight line with slope $m$ through the origin +is +\[ f(x) = m x \; . \] +With this specific function, $\chi^2$ reads +\[ \chi^2 = \sum_{i=1}^n \left( \frac{y_i-m x_i}{\sigma_i} \right)^2 +\; . \] To calculate the minimum we take the first derivative with +respect to $m$ and equate it to zero: \begin{eqnarray} -\Leftrightarrow \quad \theta & = & \frac{\sum_{i=1}^n \frac{x_i y_i}{\sigma_i^2}}{ \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2}} \label{mleslope} + \frac{\text{d}}{\text{d}m}\chi^2 & = & \sum_{i=1}^n \frac{\text{d}}{\text{d}m} \left( \frac{y_i-m x_i}{\sigma_i} \right)^2 \nonumber \\ + & = & -2 \sum_{i=1}^n \frac{x_i}{\sigma_i} \left( \frac{y_i-m x_i}{\sigma_i} \right) \nonumber \\ + & = & -2 \sum_{i=1}^n \left( \frac{x_i y_i}{\sigma_i^2} - m \frac{x_i^2}{\sigma_i^2} \right) \;\; = \;\; 0 \nonumber \\ +\Leftrightarrow \quad m \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2} & = & \sum_{i=1}^n \frac{x_i y_i}{\sigma_i^2} \nonumber \\ +\Leftrightarrow \quad m & = & \frac{\sum_{i=1}^n \frac{x_i y_i}{\sigma_i^2}}{ \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2}} \label{mleslope} \end{eqnarray} -This is an analytical expression for the estimation of the slope -$\theta$ of the regression line (\figref{mleproplinefig}). +This is an analytical expression for the estimation of the slope $m$ +of the regression line (\figref{mleproplinefig}). We do not need to +apply numerical methods like the gradient descent to find the slope +that minimizes the squared differences.
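A sketch of how \eqnref{mleslope} evaluates directly on measured data (an editorial addition; the arrays are made-up example values):

import numpy as np

rng = np.random.RandomState(3)
x = np.linspace(0.0, 10.0, 50)
sigma = np.full(len(x), 0.5)                # measurement uncertainties
y = 2.0*x + sigma*rng.randn(len(x))         # data around a line with true slope 2

# slope of the line through the origin, eq. (mleslope):
m = np.sum(x*y/sigma**2) / np.sum(x**2/sigma**2)
print(m)                                    # close to 2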
Instead, we can compute the +slope directly from the data by means of \eqnref{mleslope}, very much +like we calculate the mean of some data by means of the arithmetic +mean \eqref{arithmeticmean}. \subsection{Linear and non-linear fits} A gradient descent, as we have done in the previous chapter, is not @@ -317,47 +337,68 @@ or the coefficients $a_k$ of a polynomial \matlabfun{polyfit()}. Parameters that are non-linearly combined can not be calculated -analytically. Consider for example the rate $\lambda$ of the -exponential decay +analytically. Consider, for example, the factor $c$ and the rate +$\lambda$ of the exponential decay \[ y = c \cdot e^{\lambda x} \quad , \quad c, \lambda \in \reZ \; . \] Such cases require numerical solutions for the optimization of the cost function, e.g. the gradient descent \matlabfun{lsqcurvefit()}. \subsection{Relation between slope and correlation coefficient} -Let us have a closer look on \eqnref{mleslope}. If the standard -deviation of the data $\sigma_i$ is the same for each data point, -i.e. $\sigma_i = \sigma_j \; \forall \; i, j$, the standard deviation drops -out of \eqnref{mleslope} and we get +Let us have a closer look at \eqnref{mleslope} for the slope of a line +through the origin. If the standard deviation of the data $\sigma_i$ +is the same for each data point, i.e. $\sigma_i = \sigma_j \; \forall +\; i, j$, the standard deviation drops out and \eqnref{mleslope} +simplifies to \begin{equation} \label{whitemleslope} - \theta = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2} + m = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2} \end{equation} -To see what this expression is, we need to standardize the data. We -make the data mean free and normalize them to their standard -deviation, i.e. $x \mapsto (x - \bar x)/\sigma_x$. The resulting -numbers are also called \entermde[z-values]{z-Wert}{$z$-values} or -$z$-scores and they have the property $\bar x = 0$ and $\sigma_x = -1$. $z$-scores are often used in Biology to make quantities that -differ in their units comparable. For standardized data the variance +To see what the numerator and the denominator of this expression +describe, we need to subtract from the data their mean value. We make +the data mean free, i.e. $x \mapsto x - \bar x$ and $y \mapsto y - +\bar y$. For mean-free data the variance \begin{equation} - \sigma_x^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)^2 = \frac{1}{n} \sum_{i=1}^n x_i^2 = 1 + \sigma_x^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)^2 = \frac{1}{n} \sum_{i=1}^n x_i^2 \end{equation} -is given by the mean squared data and equals one. -The covariance between $x$ and $y$ also simplifies to +is given by the mean squared data. In the same way, the covariance +between $x$ and $y$ simplifies to \begin{equation} \text{cov}(x, y) = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)(y_i - - \bar y) =\frac{1}{n} \sum_{i=1}^n x_i y_i + \bar y) =\frac{1}{n} \sum_{i=1}^n x_i y_i \; , +\end{equation} +the averaged product between pairs of $x$ and $y$ values. Multiplying +the numerator and the denominator of \eqnref{whitemleslope} by +$\frac{1}{n}$ we get \begin{equation} + \label{meanfreeslope} + m = \frac{\frac{1}{n}\sum_{i=1}^n x_i y_i}{\frac{1}{n}\sum_{i=1}^n x_i^2} + = \frac{\text{cov}(x, y)}{\sigma_x^2} \end{equation} -the averaged product between pairs of $x$ and $y$ values. Recall that -the correlation coefficient $r_{x,y}$, -\eqnref{correlationcoefficient}, is the covariance normalized by the -product of the standard deviations of $x$ and $y$, -respectively.
Therefore, in case the standard deviations equal one, -the correlation coefficient is identical to the covariance. -Consequently, for standardized data the slope of the regression line -\eqnref{whitemleslope} simplifies to + +Recall that the correlation coefficient $r_{x,y}$ is the covariance +normalized by the product of the standard deviations of $x$ and $y$: +\begin{equation} + \label{meanfreecorrcoef} + r_{x,y} = \frac{\text{cov}(x, y)}{\sigma_x\sigma_y} +\end{equation} +If furthermore the standard deviations of $x$ and $y$ are the same, +i.e. $\sigma_x = \sigma_y$, the slope of a line through the origin is +identical to the correlation coefficient. + +This relation between slope and correlation coefficient holds in +particular for standardized data that have been made mean free and +have been normalized by their standard deviation, i.e. $x \mapsto (x - \bar +x)/\sigma_x$ and $y \mapsto (y - \bar y)/\sigma_y$. The resulting +numbers are called \entermde[z-value]{z-Wert}{$z$-values} or +\enterm[z-score]{$z$-scores}. Their mean equals zero and their +standard deviation one. $z$-scores are often used to make quantities +that differ in their units comparable. For standardized data the +denominators for both the slope \eqref{meanfreeslope} and the +correlation coefficient \eqref{meanfreecorrcoef} equal one. Both the +slope of the regression line and the correlation coefficient then +reduce to the covariance between the $x$ and +$y$ data: \begin{equation} - \theta = \frac{1}{n} \sum_{i=1}^n x_i y_i = \text{cov}(x,y) = r_{x,y} + m = \frac{1}{n} \sum_{i=1}^n x_i y_i = \text{cov}(x,y) = r_{x,y} \end{equation} For standardized data the slope of the regression line is the same as the correlation coefficient! @@ -365,63 +406,100 @@ correlation coefficient! %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Neural coding} +Maximum likelihood estimators are not only a central concept for data +analysis. Neural systems face the very same problem. They also need to +estimate parameters of the internal and external environment based on +the activity of neurons. + In sensory systems certain aspects of the environment are encoded in the neuronal activity of populations of neurons. One example of such a population code is the tuning of neurons in the primary visual cortex -(V1) to the orientation of an edge or bar in the visual -stimulus. Different neurons respond best to different edge -orientations. Traditionally, such a tuning is measured by analyzing -the neuronal response strength (e.g. the firing rate) as a function of -the orientation of a black bar and is illustrated and summarized -with the so called \enterm{tuning-curve} (\determ{Abstimmkurve}, +(V1) to the orientation of a bar in the visual stimulus. Different +neurons respond best to different bar orientations. Traditionally, +such a tuning is measured by analyzing the neuronal response strength +(e.g. the firing rate) as a function of the orientation of a black bar +and is illustrated and summarized with the so-called +\enterm{tuning-curve} (\determ{Abstimmkurve}, figure~\ref{mlecodingfig}, top). \begin{figure}[tp] \includegraphics[width=1\textwidth]{mlecoding} \titlecaption{\label{mlecodingfig} Maximum likelihood estimation of - a stimulus parameter from neuronal activity.}{Top: Tuning curve of - an individual neuron as a function of the stimulus orientation (a - dark bar in front of a white background).
The stimulus that evokes - the strongest activity in that neuron is the bar with the vertical - orientation (arrow, $\phi_i=90$\,\degree). The red area indicates - the variability of the neuronal activity $p(r|\phi)$ around the - tuning curve. Center: In a population of neurons, each neuron may - have a different tuning curve (colors). A specific stimulus (the - vertical bar) activates the individual neurons of the population - in a specific way (dots). Bottom: The log-likelihood of the - activity pattern will be maximized close to the real stimulus + a stimulus parameter from neuronal activity.}{Top: Tuning curve + $r_i(\phi;c,\phi_i)$ of a specific neuron $i$ as a function of the + orientation $\phi$ of a stimulus, a dark bar in front of a white + background. The preferred stimulus $\phi_i$ of that neuron, the + one that evokes the strongest firing rate response, is a bar with + vertical orientation (arrow, $\phi_i=90$\,\degree). The width of + the red area indicates the variability of the neuronal activity + $\sigma_i$ around the tuning curve. Center: In a population of + neurons, each neuron may have a different tuning curve (colors). A + specific stimulus activates the individual neurons of the + population in a specific way (dots). Bottom: The log-likelihood of + the activity pattern has a maximum close to the real stimulus orientation.} \end{figure} The brain, however, is confronted with the inverse problem: given a certain activity pattern in the neuronal population, what is the -stimulus (here the orientation of an edge)? In the sense of maximum -likelihood, a possible answer to this question would be: the stimulus -for which the particular activity pattern is most likely given the -tuning of the neurons. +stimulus? In our example, what is the orientation of the bar? In the +sense of maximum likelihood, a possible answer to this question would +be: the stimulus for which the particular activity pattern is most +likely given the tuning of the neurons and the noise (standard +deviation) of the responses. Let's stay with the example of the orientation tuning in V1. The -tuning $\Omega_i(\phi)$ of the neurons $i$ to the preferred edge -orientation $\phi_i$ can be well described using a van-Mises function -(the Gaussian function on a cyclic x-axis) (\figref{mlecodingfig}): +tuning of the firing rate $r_i(\phi)$ of neuron $i$ to the preferred +bar orientation $\phi_i$ can be well described using a van-Mises +function (the Gaussian function on a cyclic x-axis) +(\figref{mlecodingfig}): \begin{equation} - \Omega_i(\phi) = c \cdot e^{\cos(2(\phi-\phi_i))} \quad , \quad c \in \reZ + \label{bartuningcurve} + r_i(\phi; c, \phi_i) = c \cdot e^{\cos(2(\phi-\phi_i))} \quad , \quad c > 0 \end{equation} +Note the factor two in the cosine, because the response of the neuron +is the same for a bar turned by 180\,\degree. + If we approximate the neuronal activity by a normal distribution -around the tuning curve with a standard deviation $\sigma=\Omega/4$, -which is proportional to $\Omega$, then the probability $p_i(r|\phi)$ -of the $i$-th neuron showing the activity $r$ given a certain -orientation $\phi$ of an edge is given by +around the tuning curve with a standard deviation $\sigma_i$, then the +probability $p_i(r|\phi)$ of the $i$-th neuron having the observed +activity $r$, given a certain orientation $\phi$ of a bar is \begin{equation} - p_i(r|\phi) = \frac{1}{\sqrt{2\pi}\Omega_i(\phi)/4} e^{-\frac{1}{2}\left(\frac{r-\Omega_i(\phi)}{\Omega_i(\phi)/4}\right)^2} \; . 
+ p_i(r|\phi) = \frac{1}{\sqrt{2\pi\sigma_i^2}} e^{-\frac{1}{2}\left(\frac{r-r_i(\phi; c, \phi_i)}{\sigma_i}\right)^2} \; . \end{equation} -The log-likelihood of the edge orientation $\phi$ given the +The log-likelihood of the bar orientation $\phi$ given the activity pattern in the population $r_1$, $r_2$, ... $r_n$ is thus \begin{equation} {\cal L}(\phi|r_1, r_2, \ldots, r_n) = \sum_{i=1}^n \log p_i(r_i|\phi) \end{equation} -The angle $\phi$ that maximizes this likelihood is then an estimate of -the orientation of the edge. +The angle $\phi_{mle}$ that maximizes this likelihood is an estimate +of the true orientation of the bar (\figref{mlecodingfig}). + +The noisiness of a neuron's response, as quantified by $\sigma_i$, +is usually a function of the neuron's mean firing rate $r_i$, +\eqnref{bartuningcurve}: $\sigma_i = \sigma_i(r_i)$. This dependence +has a major impact on the maximum likelihood estimation. Usually, the +stronger the response of a neuron, i.e. the higher its firing rate, the +lower the noise. In this case, strong responses will have a stronger +influence on the position of the maximum of the log-likelihood. + +Whether neural systems really implement maximum likelihood estimators +is another question. There are many ways in which a stimulus property can +be read out from the activity of a population of neurons. The simplest +one is a ``winner takes all'' rule: the preferred stimulus +parameter of the neuron with the strongest response is the +estimate. Another possibility is to compute a population vector. The +estimated stimulus parameter is the sum of the preferred stimulus +parameters of all neurons in the population, weighted by the activity +of the neurons. In case of angular stimulus parameters, like the +orientation of the bar in our example, a vector pointing in the +direction of the angle is used instead of the angle to incorporate the +cyclic nature of the parameter (see the sketch below). + +Using maximum likelihood estimators for analyzing neural population +activity gives us an upper bound on how well a stimulus parameter is +encoded in the activity of the neurons. The brain would not be able to +do better.
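As announced above, a small sketch of the population-vector readout (an editorial addition with made-up numbers; it is not part of the figure script that follows):

import numpy as np

# preferred orientations of the population (in radians) and some made-up responses:
prefphi = np.deg2rad(np.arange(0.0, 180.0, 22.5))
rates = np.array([0.4, 1.1, 2.3, 2.7, 1.9, 0.8, 0.3, 0.2])

# orientation is cyclic with period 180 degrees, so the angles are doubled,
# summed as activity-weighted vectors, and the resulting angle is halved again:
vx = np.sum(rates*np.cos(2.0*prefphi))
vy = np.sum(rates*np.sin(2.0*prefphi))
phi_est = 0.5*np.arctan2(vy, vx)
print(np.rad2deg(phi_est) % 180.0)          # population-vector estimate in degrees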
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/likelihood/lecture/mlecoding.py b/likelihood/lecture/mlecoding.py index 3edd5a5..8be385f 100644 --- a/likelihood/lecture/mlecoding.py +++ b/likelihood/lecture/mlecoding.py @@ -5,11 +5,9 @@ import matplotlib.gridspec as gridspec from plotstyle import * rng = np.random.RandomState(4637281) -lmarg=0.1 -rmarg=0.1 fig = plt.figure(figsize=cm_size(figure_width, 2.8*figure_height)) -spec = gridspec.GridSpec(nrows=4, ncols=1, height_ratios=[4, 4, 1, 3], hspace=0.2, +spec = gridspec.GridSpec(nrows=4, ncols=1, height_ratios=[4, 5, 1, 3], hspace=0.2, **adjust_fs(fig, left=4.0)) ax = fig.add_subplot(spec[0, 0]) ax.set_xlim(0.0, np.pi) @@ -17,7 +15,7 @@ ax.set_xticks(np.arange(0.125*np.pi, 1.*np.pi, 0.125*np.pi)) ax.set_xticklabels([]) ax.set_ylim(0.0, 3.5) ax.yaxis.set_major_locator(plt.NullLocator()) -ax.text(-0.2, 0.5*3.5, 'Activity', rotation='vertical', va='center') +ax.text(-0.2, 0.5*3.5, 'Firing rate', rotation='vertical', va='center') ax.annotate('Tuning curve', xy=(0.42*np.pi, 2.5), xycoords='data', xytext=(0.3*np.pi, 3.2), textcoords='data', ha='right', @@ -32,27 +30,34 @@ ax.text(0.52*np.pi, 0.7, 'preferred\norientation') xx = np.arange(0.0, 2.0*np.pi, 0.01) pp = 0.5*np.pi yy = np.exp(np.cos(2.0*(xx+pp))) -ax.fill_between(xx, yy+0.25*yy, yy-0.25*yy, **fsBa) +ss = 1.0/(0.75+2.0*yy) +ax.fill_between(xx, yy+ss, yy-ss, **fsBa) ax.plot(xx, yy, **lsB) ax = fig.add_subplot(spec[1, 0]) ax.set_xlim(0.0, np.pi) ax.set_xticks(np.arange(0.125*np.pi, 1.*np.pi, 0.125*np.pi)) ax.set_xticklabels([]) -ax.set_ylim(0.0, 3.0) +ax.set_ylim(-0.1, 3.5) ax.yaxis.set_major_locator(plt.NullLocator()) -ax.text(-0.2, 0.5*3.5, 'Activity', rotation='vertical', va='center') +ax.text(-0.2, 0.5*3.5, 'Firing rate', rotation='vertical', va='center') xx = np.arange(0.0, 1.0*np.pi, 0.01) prefphases = np.arange(0.125*np.pi, 1.*np.pi, 0.125*np.pi) responses = [] -xresponse = 0.475*np.pi +sigmas = [] +xresponse = 0.41*np.pi +ax.annotate('Orientation of bar', + xy=(xresponse, -0.1), xycoords='data', + xytext=(xresponse, 3.1), textcoords='data', ha='left', zorder=-10, + arrowprops=dict(arrowstyle="->", relpos=(0.0,0.0)) ) for pp, ls, ps in zip(prefphases, [lsE, lsC, lsD, lsB, lsD, lsC, lsE], [psE, psC, psD, psB, psD, psC, psE]) : yy = np.exp(np.cos(2.0*(xx+pp))) - #ax.plot(xx, yy, color=cm.autumn(2.0*np.abs(pp/np.pi-0.5), 1)) ax.plot(xx, yy, **ls) y = np.exp(np.cos(2.0*(xresponse+pp))) - responses.append(y + rng.randn()*0.25*y) + s = 1.0/(0.75+2.0*y) + responses.append(y + rng.randn()*s) + sigmas.append(s) ax.plot(xresponse, y, **ps) responses = np.array(responses) @@ -68,25 +73,23 @@ ax = fig.add_subplot(spec[3, 0]) ax.set_xlim(0.0, np.pi) ax.set_xticks(np.arange(0.125*np.pi, 1.*np.pi, 0.125*np.pi)) ax.set_xticklabels([]) -ax.set_ylim(-1600, 0) +ax.set_ylim(-210, 0) ax.yaxis.set_major_locator(plt.NullLocator()) ax.set_xlabel('Orientation') -ax.text(-0.2, -800, 'Log-Likelihood', rotation='vertical', va='center') +ax.text(-0.2, -100, 'Log-Likelihood', rotation='vertical', va='center') phases = np.linspace(0.0, 1.1*np.pi, 100) probs = np.zeros((len(responses), len(phases))) -for k, (pp, r) in enumerate(zip(prefphases, responses)) : +for k, (pp, r, sigma) in enumerate(zip(prefphases, responses, sigmas)) : y = np.exp(np.cos(2.0*(phases+pp))) - sigma = 0.1*y - probs[k,:] = np.exp(-0.5*((r-y)/sigma)**2.0)/np.sqrt(2.0*np.pi)/sigma + probs[k,:] = np.exp(-0.5*((r-y)/sigma)**2.0)/np.sqrt(2.0*np.pi*sigma**2) loglikelihood = np.sum(np.log(probs), 0) -maxl 
= np.max(loglikelihood) maxp = phases[np.argmax(loglikelihood)] ax.annotate('', - xy=(maxp, -1600), xycoords='data', - xytext=(maxp, -30), textcoords='data', + xy=(maxp, -210), xycoords='data', + xytext=(maxp, -10), textcoords='data', arrowprops=dict(arrowstyle="->", relpos=(0.5,0.5), connectionstyle="angle3,angleA=80,angleB=90") ) -ax.text(maxp+0.05, -1100, 'most likely\norientation\ngiven the responses') -ax.plot(phases, loglikelihood, **lsA) +ax.text(maxp+0.05, -150, 'most likely\norientation\ngiven the responses') +ax.plot(phases, loglikelihood, clip_on=False, **lsA) plt.savefig('mlecoding.pdf')
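Finally, an editorial sketch for the fitting of psychometric functions mentioned in the TODO list of likelihood-chapter.tex at the top of this diff. A logistic function stands in for the Weibull or Boltzmann options, and SciPy's Nelder-Mead simplex plays the role of MATLAB's fminsearch; all names and numbers are illustrative:

import numpy as np
from scipy.optimize import minimize

def psychometric(x, theta):
    x0, slope = theta
    return 1.0/(1.0 + np.exp(-slope*(x - x0)))   # logistic function

def neg_loglikelihood(theta, x, r):
    p = psychometric(x, theta)
    p = np.clip(p, 1e-10, 1.0 - 1e-10)           # avoid log(0)
    # likelihood of 0/1 responses: prod p^r (1-p)^(1-r), here as log-likelihood
    return -np.sum(r*np.log(p) + (1.0 - r)*np.log(1.0 - p))

rng = np.random.RandomState(5)
x = np.repeat(np.linspace(-2.0, 2.0, 9), 20)     # stimulus values, 20 trials each
r = (rng.rand(len(x)) < psychometric(x, [0.3, 2.0])).astype(float)

theta_mle = minimize(neg_loglikelihood, [0.0, 1.0], args=(x, r),
                     method='Nelder-Mead').x
print(theta_mle)                                 # threshold and slope estimates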