From fdd4706a51a7ab80e97255383abe602f5b6bad15 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Sat, 30 Nov 2019 20:03:18 +0100
Subject: [PATCH] [likelihood] regression slope equals correlation coefficient

---
 likelihood/lecture/likelihood.tex        | 48 ++++++++++++++++++++----
 statistics/material/varianceexplained.py | 46 ++++++++++++++++++++++
 2 files changed, 86 insertions(+), 8 deletions(-)
 create mode 100644 statistics/material/varianceexplained.py

diff --git a/likelihood/lecture/likelihood.tex b/likelihood/lecture/likelihood.tex
index 0579409..91680f3 100644
--- a/likelihood/lecture/likelihood.tex
+++ b/likelihood/lecture/likelihood.tex
@@ -181,20 +181,21 @@ respect to $\theta$ and equate it to zero:
 \frac{\text{d}}{\text{d}\theta}\chi^2 & = & \frac{\text{d}}{\text{d}\theta} \sum_{i=1}^n \left( \frac{y_i-\theta x_i}{\sigma_i} \right)^2 \nonumber \\
  & = & \sum_{i=1}^n \frac{\text{d}}{\text{d}\theta} \left( \frac{y_i-\theta x_i}{\sigma_i} \right)^2 \nonumber \\
  & = & -2 \sum_{i=1}^n \frac{x_i}{\sigma_i} \left( \frac{y_i-\theta x_i}{\sigma_i} \right) \nonumber \\
- & = & -2 \sum_{i=1}^n \left( \frac{x_iy_i}{\sigma_i^2} - \theta \frac{x_i^2}{\sigma_i^2} \right) \;\; = \;\; 0 \nonumber \\
-\Leftrightarrow \quad \theta \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2} & = & \sum_{i=1}^n \frac{x_iy_i}{\sigma_i^2} \nonumber \\
-\Leftrightarrow \quad \theta & = & \frac{\sum_{i=1}^n \frac{x_iy_i}{\sigma_i^2}}{ \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2}} \label{mleslope}
+ & = & -2 \sum_{i=1}^n \left( \frac{x_i y_i}{\sigma_i^2} - \theta \frac{x_i^2}{\sigma_i^2} \right) \;\; = \;\; 0 \nonumber \\
+\Leftrightarrow \quad \theta \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2} & = & \sum_{i=1}^n \frac{x_i y_i}{\sigma_i^2} \nonumber \\
+\Leftrightarrow \quad \theta & = & \frac{\sum_{i=1}^n \frac{x_i y_i}{\sigma_i^2}}{ \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2}} \label{mleslope}
 \end{eqnarray}
 This is an analytical expression for the estimation of the slope
 $\theta$ of the regression line (\figref{mleproplinefig}).
 
 A gradient descent, as we have done in the previous chapter, is not
-necessary for fitting the slope of a straight line. More generally,
-this is the case also for fitting the coefficients of linearly
-combined basis functions as for example the slope $m$ and the
-y-intercept $b$ of a straight line
+necessary for fitting the slope of a straight line, because the slope
+can be directly computed via \eqnref{mleslope}. More generally, this
+also holds for fitting the coefficients of linearly combined
+basis functions, as for example the slope $m$ and the y-intercept $b$
+of a straight line
 \[ y = m \cdot x +b \]
-or, more generally, the coefficients $a_k$ of a polynom
+or the coefficients $a_k$ of a polynomial
 \[ y = \sum_{k=0}^N a_k x^k = a_0 + a_1x + a_2x^2 + a_3x^3 + \ldots \]
 \matlabfun{polyfit()}.
 
@@ -205,6 +206,37 @@ exponential decay
 Such cases require numerical solutions for the optimization of the
 cost function, e.g. the gradient descent \matlabfun{lsqcurvefit()}.
 
+Let us have a closer look at \eqnref{mleslope}. If the standard
+deviation of the data $\sigma_i$ is the same for each data point,
+i.e. $\sigma_i = \sigma_j \; \forall \; i, j$, the standard deviation drops
+out of \eqnref{mleslope} and we get
+\begin{equation}
+  \label{whitemleslope}
+  \theta = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2}
+\end{equation}
+To see what this expression means, we standardize the data: we
+make the data mean-free and normalize them to their standard
+deviation, i.e. $x \mapsto (x - \bar x)/\sigma_x$. The resulting
+numbers are also called \enterm{$z$-values} or $z$-scores and they
+have the properties $\bar x = 0$ and $\sigma_x = 1$. In this
+case the variance
+\[ \sigma_x^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)^2 = \frac{1}{n} \sum_{i=1}^n x_i^2 = 1 \]
+is simply the mean of the squared data values and equals one.
+The covariance between $x$ and $y$ also simplifies to
+\[ \text{cov}(x, y) = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)(y_i - \bar y) = \frac{1}{n} \sum_{i=1}^n x_i y_i \]
+the average product of paired $x$ and $y$ values.
+Recall that the correlation coefficient $r_{x,y}$ is the covariance
+normalized by the product of the standard deviations of $x$ and
+$y$. Therefore, if both standard deviations equal one, the
+correlation coefficient equals the covariance.
+Consequently, for standardized data the slope of the regression line
+\eqnref{whitemleslope} simplifies to
+\begin{equation}
+  \theta = \frac{1}{n} \sum_{i=1}^n x_i y_i = \text{cov}(x,y) = r_{x,y}
+\end{equation}
+For standardized data the slope of the regression line equals the
+correlation coefficient!
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Fitting probability distributions}
 
diff --git a/statistics/material/varianceexplained.py b/statistics/material/varianceexplained.py
new file mode 100644
index 0000000..296e00f
--- /dev/null
+++ b/statistics/material/varianceexplained.py
@@ -0,0 +1,46 @@
+import numpy as np
+import scipy.stats as st
+import matplotlib.pyplot as plt
+
+# noisy data scattered around a straight line through the origin:
+x = np.arange(0.0, 10.0, 0.5)
+ytrue = 2.0*x
+y = ytrue + 5.0*np.random.randn(len(x))
+# fit a straight line to the data:
+z = np.polyfit(x, y, 1)
+p = np.poly1d(z)
+
+# correlation coefficient between x and y:
+r, _ = st.pearsonr(x, y)
+
+print('total variance: %g' % np.var(y))
+print('residual variance: %g' % np.var(y-p(x)))
+print('variance explained: %g' % (1.0-np.var(y-p(x))/np.var(y)))
+print('r: %g' % r)
+print('variance explained (r^2): %g' % (r*r))
+print('slope: %g' % z[0])
+
+# plot data, true line, and fit, as well as the residuals of the fit:
+fig, axs = plt.subplots(1, 2, sharex=True, sharey=True)
+axs[0].plot(x, ytrue, 'r')
+axs[0].scatter(x, y, c='b')
+axs[0].plot(x, p(x), 'b')
+axs[1].scatter(x, y-p(x), c='b')
+#plt.show()
+
+# standardize the data to zero mean and unit standard deviation (z-scores):
+xn = (x - np.mean(x))/np.std(x)
+yn = (y - np.mean(y))/np.std(y)
+# for standardized data the slope of the fit equals the correlation coefficient:
+z = np.polyfit(xn, yn, 1)
+p = np.poly1d(z)
+r, _ = st.pearsonr(xn, yn)
+
+print('r: %g' % r)
+print('slope: %g' % z[0])
+
+fig, ax = plt.subplots()
+ax.scatter(xn, yn, c='b')
+ax.plot(xn, p(xn), 'b')
+plt.show()
+
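
Not part of the patch: the short Python sketch below is a numerical sanity check of the weighted slope estimate in \eqnref{mleslope} and of the claim that for standardized data the unweighted slope \eqnref{whitemleslope} equals the correlation coefficient. It uses only numpy and scipy.stats, as varianceexplained.py does; the noise model with x-dependent standard deviations and all variable names are illustrative assumptions, not taken from the lecture material.

import numpy as np
import scipy.stats as st

# illustrative data: a line through the origin with per-point noise levels sigma_i
# (the noise model and the variable names are assumptions, not from the lecture)
rng = np.random.default_rng(1)
n = 200
x = np.linspace(0.1, 10.0, n)
sigma = 0.5 + 0.2*x
y = 2.0*x + sigma*rng.standard_normal(n)

# weighted maximum-likelihood slope, eq. (mleslope):
theta = np.sum(x*y/sigma**2) / np.sum(x**2/sigma**2)
print('estimated slope: %.3f' % theta)   # should be close to the true slope of 2

# standardized data: the unweighted slope, eq. (whitemleslope), equals r:
xs = (x - np.mean(x))/np.std(x)
ys = (y - np.mean(y))/np.std(y)
theta_std = np.sum(xs*ys) / np.sum(xs**2)
r, _ = st.pearsonr(x, y)
print('slope of standardized data: %.3f' % theta_std)
print('correlation coefficient:    %.3f' % r)

The last two printed numbers should agree to within floating-point precision, which is exactly the statement of the paragraph added to likelihood.tex above.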