From fdd4706a51a7ab80e97255383abe602f5b6bad15 Mon Sep 17 00:00:00 2001
From: Jan Benda
Date: Sat, 30 Nov 2019 20:03:18 +0100
Subject: [PATCH] [likelihood] regression slope equals correlation coefficient

---
 likelihood/lecture/likelihood.tex        | 48 ++++++++++++++++++++----
 statistics/material/varianceexplained.py | 46 ++++++++++++++++++++++
 2 files changed, 86 insertions(+), 8 deletions(-)
 create mode 100644 statistics/material/varianceexplained.py

diff --git a/likelihood/lecture/likelihood.tex b/likelihood/lecture/likelihood.tex
index 0579409..91680f3 100644
--- a/likelihood/lecture/likelihood.tex
+++ b/likelihood/lecture/likelihood.tex
@@ -181,20 +181,21 @@ respect to $\theta$ and equate it to zero:
 \frac{\text{d}}{\text{d}\theta}\chi^2 & = & \frac{\text{d}}{\text{d}\theta} \sum_{i=1}^n \left( \frac{y_i-\theta x_i}{\sigma_i} \right)^2 \nonumber \\
  & = & \sum_{i=1}^n \frac{\text{d}}{\text{d}\theta} \left( \frac{y_i-\theta x_i}{\sigma_i} \right)^2 \nonumber \\
  & = & -2 \sum_{i=1}^n \frac{x_i}{\sigma_i} \left( \frac{y_i-\theta x_i}{\sigma_i} \right) \nonumber \\
- & = & -2 \sum_{i=1}^n \left( \frac{x_iy_i}{\sigma_i^2} - \theta \frac{x_i^2}{\sigma_i^2} \right) \;\; = \;\; 0 \nonumber \\
-\Leftrightarrow \quad \theta \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2} & = & \sum_{i=1}^n \frac{x_iy_i}{\sigma_i^2} \nonumber \\
-\Leftrightarrow \quad \theta & = & \frac{\sum_{i=1}^n \frac{x_iy_i}{\sigma_i^2}}{ \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2}} \label{mleslope}
+ & = & -2 \sum_{i=1}^n \left( \frac{x_i y_i}{\sigma_i^2} - \theta \frac{x_i^2}{\sigma_i^2} \right) \;\; = \;\; 0 \nonumber \\
+\Leftrightarrow \quad \theta \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2} & = & \sum_{i=1}^n \frac{x_i y_i}{\sigma_i^2} \nonumber \\
+\Leftrightarrow \quad \theta & = & \frac{\sum_{i=1}^n \frac{x_i y_i}{\sigma_i^2}}{ \sum_{i=1}^n \frac{x_i^2}{\sigma_i^2}} \label{mleslope}
 \end{eqnarray}
 This is an analytical expression for the estimation of the slope
 $\theta$ of the regression line (\figref{mleproplinefig}).
 
 A gradient descent, as we have done in the previous chapter, is not
-necessary for fitting the slope of a straight line. More generally,
-this is the case also for fitting the coefficients of linearly
-combined basis functions as for example the slope $m$ and the
-y-intercept $b$ of a straight line
+necessary for fitting the slope of a straight line, because the slope
+can be directly computed via \eqnref{mleslope}. More generally, this
+also holds for fitting the coefficients of linearly combined
+basis functions, as for example the slope $m$ and the y-intercept $b$
+of a straight line
 \[ y = m \cdot x +b \]
-or, more generally, the coefficients $a_k$ of a polynom
+or the coefficients $a_k$ of a polynomial
 \[ y = \sum_{k=0}^N a_k x^k = a_0 + a_1x + a_2x^2 + a_3x^3 + \ldots \]
 \matlabfun{polyfit()}.
 
@@ -205,6 +206,37 @@ exponential decay
 Such cases require numerical solutions for the optimization of the
 cost function, e.g. the gradient descent \matlabfun{lsqcurvefit()}.
 
+Let us have a closer look at \eqnref{mleslope}. If the standard
+deviation of the data $\sigma_i$ is the same for each data point,
+i.e. $\sigma_i = \sigma_j \; \forall \; i, j$, the standard deviation drops
+out of \eqnref{mleslope} and we get
+\begin{equation}
+  \label{whitemleslope}
+  \theta = \frac{\sum_{i=1}^n x_i y_i}{\sum_{i=1}^n x_i^2}
+\end{equation}
+To see what this expression means, we standardize the data: we
+make the data mean-free and normalize them to their standard
+deviation, i.e. $x \mapsto (x - \bar x)/\sigma_x$. The resulting
+numbers are also called \enterm{$z$-values} or $z$-scores and they
+have the properties $\bar x = 0$ and $\sigma_x = 1$. In this
+case the variance
+\[ \sigma_x^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)^2 = \frac{1}{n} \sum_{i=1}^n x_i^2 = 1 \]
+is simply the mean of the squared data values and equals one.
+The covariance between $x$ and $y$ also simplifies to
+\[ \text{cov}(x, y) = \frac{1}{n} \sum_{i=1}^n (x_i - \bar x)(y_i - \bar y) = \frac{1}{n} \sum_{i=1}^n x_i y_i \]
+the average product of paired $x$ and $y$ values.
+Recall that the correlation coefficient $r_{x,y}$ is the covariance
+normalized by the product of the standard deviations of $x$ and
+$y$. Therefore, if both standard deviations equal one, the
+correlation coefficient equals the covariance.
+Consequently, for standardized data the slope of the regression line
+\eqnref{whitemleslope} simplifies to
+\begin{equation}
+  \theta = \frac{1}{n} \sum_{i=1}^n x_i y_i = \text{cov}(x,y) = r_{x,y}
+\end{equation}
+For standardized data the slope of the regression line equals the
+correlation coefficient!
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Fitting probability distributions}
 
diff --git a/statistics/material/varianceexplained.py b/statistics/material/varianceexplained.py
new file mode 100644
index 0000000..296e00f
--- /dev/null
+++ b/statistics/material/varianceexplained.py
@@ -0,0 +1,46 @@
+import numpy as np
+import scipy.stats as st
+import matplotlib.pyplot as plt
+
+# noisy data scattered around a straight line through the origin:
+x = np.arange(0.0, 10.0, 0.5)
+ytrue = 2.0*x
+y = ytrue + 5.0*np.random.randn(len(x))
+# fit a straight line to the data:
+z = np.polyfit(x, y, 1)
+p = np.poly1d(z)
+
+# correlation coefficient between x and y:
+r, _ = st.pearsonr(x, y)
+
+print('total variance: %g' % np.var(y))
+print('residual variance: %g' % np.var(y-p(x)))
+print('variance explained: %g' % (1.0-np.var(y-p(x))/np.var(y)))
+print('r: %g' % r)
+print('variance explained (r^2): %g' % (r*r))
+print('slope: %g' % z[0])
+
+# plot data, true line, and fit, as well as the residuals of the fit:
+fig, axs = plt.subplots(1, 2, sharex=True, sharey=True)
+axs[0].plot(x, ytrue, 'r')
+axs[0].scatter(x, y, c='b')
+axs[0].plot(x, p(x), 'b')
+axs[1].scatter(x, y-p(x), c='b')
+#plt.show()
+
+# standardize the data to zero mean and unit standard deviation (z-scores):
+xn = (x - np.mean(x))/np.std(x)
+yn = (y - np.mean(y))/np.std(y)
+# for standardized data the slope of the fit equals the correlation coefficient:
+z = np.polyfit(xn, yn, 1)
+p = np.poly1d(z)
+r, _ = st.pearsonr(xn, yn)
+
+print('r: %g' % r)
+print('slope: %g' % z[0])
+
+fig, ax = plt.subplots()
+ax.scatter(xn, yn, c='b')
+ax.plot(xn, p(xn), 'b')
+plt.show()
+
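
Not part of the patch: the short Python sketch below is a numerical sanity check of the weighted slope estimate in \eqnref{mleslope} and of the claim that for standardized data the unweighted slope \eqnref{whitemleslope} equals the correlation coefficient. It uses only numpy and scipy.stats, as varianceexplained.py does; the noise model with x-dependent standard deviations and all variable names are illustrative assumptions, not taken from the lecture material.

import numpy as np
import scipy.stats as st

# illustrative data: a line through the origin with per-point noise levels sigma_i
# (the noise model and the variable names are assumptions, not from the lecture)
rng = np.random.default_rng(1)
n = 200
x = np.linspace(0.1, 10.0, n)
sigma = 0.5 + 0.2*x
y = 2.0*x + sigma*rng.standard_normal(n)

# weighted maximum-likelihood slope, eq. (mleslope):
theta = np.sum(x*y/sigma**2) / np.sum(x**2/sigma**2)
print('estimated slope: %.3f' % theta)   # should be close to the true slope of 2

# standardized data: the unweighted slope, eq. (whitemleslope), equals r:
xs = (x - np.mean(x))/np.std(x)
ys = (y - np.mean(y))/np.std(y)
theta_std = np.sum(xs*ys) / np.sum(xs**2)
r, _ = st.pearsonr(x, y)
print('slope of standardized data: %.3f' % theta_std)
print('correlation coefficient:    %.3f' % r)

The last two printed numbers should agree to within floating-point precision, which is exactly the statement of the paragraph added to likelihood.tex above.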