From 4273a953f6c05b6175a8958bb1833e037ddcaef8 Mon Sep 17 00:00:00 2001 From: Jan Benda Date: Wed, 11 Dec 2019 09:48:19 +0100 Subject: [PATCH] [regression] further improved the chapter --- regression/code/errorGradient.m | 13 +- regression/code/errorSurface.m | 2 +- regression/code/gradientDescent.m | 4 +- regression/code/meanSquaredError.m | 8 +- regression/code/meanSquaredGradient.m | 14 +-- regression/lecture/regression-chapter.tex | 14 +-- regression/lecture/regression.tex | 145 +++++++++++----------- 7 files changed, 92 insertions(+), 108 deletions(-) diff --git a/regression/code/errorGradient.m b/regression/code/errorGradient.m index b9bb642..2f3ec16 100644 --- a/regression/code/errorGradient.m +++ b/regression/code/errorGradient.m @@ -1,22 +1,13 @@ % x, y, slopes, and intercepts from exercise 8.3 -slopes = -5:0.25:5; -intercepts = -30:1:30; -error_surface = zeros(length(slopes), length(intercepts)); -for i = 1:length(slopes) - for j = 1:length(intercepts) - error_surf(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y); - end -end - error_surface = zeros(length(slopes), length(intercepts)); gradient_m = zeros(size(error_surface)); gradient_b = zeros(size(error_surface)); for i = 1:length(slopes) for j = 1:length(intercepts) - error_surface(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y); - grad = meanSquaredGradient([slopes(i), intercepts(j)], x, y); + error_surface(i,j) = meanSquaredError(x, y, [slopes(i), intercepts(j)]); + grad = meanSquaredGradient(x, y, [slopes(i), intercepts(j)]); gradient_m(i,j) = grad(1); gradient_b(i,j) = grad(2); end diff --git a/regression/code/errorSurface.m b/regression/code/errorSurface.m index f86e3b1..33380ae 100644 --- a/regression/code/errorSurface.m +++ b/regression/code/errorSurface.m @@ -11,7 +11,7 @@ intercepts = -30:1:30; error_surface = zeros(length(slopes), length(intercepts)); for i = 1:length(slopes) for j = 1:length(intercepts) - error_surf(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y); + error_surface(i,j) = meanSquaredError(x, y, [slopes(i), intercepts(j)]); end end diff --git a/regression/code/gradientDescent.m b/regression/code/gradientDescent.m index a482279..1158b0b 100644 --- a/regression/code/gradientDescent.m +++ b/regression/code/gradientDescent.m @@ -9,8 +9,8 @@ errors = []; count = 1; eps = 0.01; while isempty(gradient) || norm(gradient) > 0.1 - gradient = meanSquaredGradient(position, x, y); - errors(count) = meanSquaredError(position, x, y); + gradient = meanSquaredGradient(x, y, position); + errors(count) = meanSquaredError(x, y, position); position = position - eps .* gradient; count = count + 1; end diff --git a/regression/code/meanSquaredError.m b/regression/code/meanSquaredError.m index cb21324..6eeea7b 100644 --- a/regression/code/meanSquaredError.m +++ b/regression/code/meanSquaredError.m @@ -1,10 +1,10 @@ -function mse = meanSquaredError(parameter, x, y) +function mse = meanSquaredError(x, y, parameter) % Mean squared error between a straight line and data pairs. % -% Arguments: parameter, vector containing slope and intercept - % as the 1st and 2nd element, respectively. -% x, vector of the input values +% Arguments: x, vector of the input values % y, vector of the corresponding measured output values +% parameter, vector containing slope and intercept +% as the 1st and 2nd element, respectively. % % Returns: mse, the mean-squared-error.
diff --git a/regression/code/meanSquaredGradient.m b/regression/code/meanSquaredGradient.m index 9817b0f..b6bd0ad 100644 --- a/regression/code/meanSquaredGradient.m +++ b/regression/code/meanSquaredGradient.m @@ -1,16 +1,16 @@ -function gradient = meanSquaredGradient(parameter, x, y) +function gradient = meanSquaredGradient(x, y, parameter) % The gradient of the mean squared error % -% Arguments: parameter, vector containing slope and intercept -% as the 1st and 2nd element -% x, vector of the input values +% Arguments: x, vector of the input values % y, vector of the corresponding measured output values +% parameter, vector containing slope and intercept +% as the 1st and 2nd element % % Returns: the gradient as a vector with two elements h = 1e-6; % stepsize for derivatives - mse = meanSquaredError(parameter, x, y); - partial_m = (meanSquaredError([parameter(1)+h, parameter(2)], x, y) - mse)/h; - partial_n = (meanSquaredError([parameter(1), parameter(2)+h], x, y) - mse)/h; + mse = meanSquaredError(x, y, parameter); + partial_m = (meanSquaredError(x, y, [parameter(1)+h, parameter(2)]) - mse)/h; + partial_n = (meanSquaredError(x, y, [parameter(1), parameter(2)+h]) - mse)/h; gradient = [partial_m, partial_n]; end diff --git a/regression/lecture/regression-chapter.tex b/regression/lecture/regression-chapter.tex index 7640387..4ee5414 100644 --- a/regression/lecture/regression-chapter.tex +++ b/regression/lecture/regression-chapter.tex @@ -16,9 +16,11 @@ \include{regression} -\section{Improvements} -Adapt function arguments to matlabs polyfit. That is: first the data -(x,y) and then the parameter vector. p(1) is slope, p(2) is intercept. +\subsection{Linear fits} +\begin{itemize} +\item Polyfit is easy: unique solution! +\item Example for overfitting with polyfit of a high order (=number of data points) +\end{itemize} \section{Fitting in practice} @@ -34,11 +36,5 @@ Fit with matlab functions lsqcurvefit, polyfit \item How to test the quality of a fit? Residuals. $\chi^2$ test. Run-test. \end{itemize} -\subsection{Linear fits} -\begin{itemize} -\item Polyfit is easy: unique solution! -\item Example for overfitting with polyfit of a high order (=number of data points) -\end{itemize} - \end{document} diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex index efc490e..0b78a16 100644 --- a/regression/lecture/regression.tex +++ b/regression/lecture/regression.tex @@ -52,40 +52,43 @@ considered an optimal fit. In our example we search the parameter combination that describe the relation of $x$ and $y$ best. What is meant by this? Each input $x_i$ leads to an measured output $y_i$ and for each $x_i$ there is a \emph{prediction} or \emph{estimation} -$y^{est}_i$ of the output value by the model. At each $x_i$ estimation -and measurement have a distance or error $y_i - y_i^{est}$. In our -example the estimation is given by the equation $y_i^{est} = -f(x;m,b)$. The best fitting model with parameters $m$ and $b$ is the -one that minimizes the distances between observation $y_i$ and -estimation $y_i^{est}$ (\figref{leastsquareerrorfig}). +$y^{est}(x_i)$ of the output value by the model. At each $x_i$ +estimation and measurement have a distance or error $y_i - +y^{est}(x_i)$. In our example the estimation is given by the equation +$y^{est}(x_i) = f(x_i;m,b)$. The best fitting model with parameters +$m$ and $b$ is the one that minimizes the distances between +observation $y_i$ and estimation $y^{est}(x_i)$ +(\figref{leastsquareerrorfig}). 
As a first guess we could simply minimize the sum $\sum_{i=1}^N y_i - -y^{est}_i$. This approach, however, will not work since a minimal sum +y^{est}(x_i)$. This approach, however, will not work since a minimal sum can also be achieved if half of the measurements is above and the other half below the predicted line. Positive and negative errors would cancel out and then sum up to values close to zero. A better approach is to sum over the absolute values of the distances: -$\sum_{i=1}^N |y_i - y^{est}_i|$. This sum can only be small if all +$\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This sum can only be small if all deviations are indeed small no matter if they are above or below the predicted line. Instead of the sum we could also take the average \begin{equation} \label{meanabserror} - f_{dist}(\{(x_i, y_i)\}|\{y^{est}_i\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}_i| + f_{dist}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}(x_i)| \end{equation} -For reasons that are explained in -chapter~\ref{maximumlikelihoodchapter}, instead of the averaged -absolute errors, the \enterm[mean squared error]{mean squared error} -(\determ[quadratischer Fehler!mittlerer]{mittlerer quadratischer - Fehler}) +Instead of the averaged absolute errors, the \enterm[mean squared +error]{mean squared error} (\determ[quadratischer +Fehler!mittlerer]{mittlerer quadratischer Fehler}) \begin{equation} \label{meansquarederror} - f_{mse}(\{(x_i, y_i)\}|\{y^{est}_i\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}_i)^2 + f_{mse}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}(x_i))^2 \end{equation} is commonly used (\figref{leastsquareerrorfig}). Similar to the -absolute distance, the square of the errors, $(y_i - y_i^{est})^2$, is +absolute distance, the square of the errors, $(y_i - y^{est}(x_i))^2$, is always positive and thus positive and negative error values do not cancel each other out. In addition, the square punishes large -deviations over small deviations. +deviations more strongly than small ones. In +chapter~\ref{maximumlikelihoodchapter} we show that minimizing the +mean square error is equivalent to maximizing the likelihood that the +observations originate from the model, if the data are normally +distributed around the model prediction. \begin{exercise}{meanSquaredErrorLine.m}{}\label{mseexercise}% Given a vector of observations \varcode{y} and a vector with the @@ -98,20 +101,13 @@ deviations over small deviations. \section{Objective function} The mean squared error is a so called \enterm{objective function} or -\enterm{cost function} (\determ{Kostenfunktion}), $f_{cost}(\{(x_i, -y_i)\}|\{y^{est}_i\})$. A cost function assigns to the given data set -$\{(x_i, y_i)\}$ and corresponding model predictions $\{y^{est}_i\}$ a -single scalar value that we want to minimize. Here we aim to adapt the -model parameters to minimize the mean squared error -\eqref{meansquarederror}. In chapter~\ref{maximumlikelihoodchapter} we -show that the minimization of the mean square error is equivalent to -maximizing the likelihood that the observations originate from the -model (assuming a normal distribution of the data around the model -prediction). The \enterm{cost function} does not have to be the mean -square error but can be any function that maps the data and the -predictions to a scalar value describing the quality of the fit. In -the optimization process we aim for the paramter combination that -minimizes the costs. +\enterm{cost function} (\determ{Kostenfunktion}).
A cost function +assigns to a model prediction $\{y^{est}(x_i)\}$ for a given data set +$\{(x_i, y_i)\}$ a single scalar value that we want to minimize. Here +we aim to adapt the model parameters to minimize the mean squared +error \eqref{meansquarederror}. In general, the \enterm{cost function} +can be any function that describes the quality of the fit by mapping +the data and the predictions to a single scalar value. \begin{figure}[t] \includegraphics[width=1\textwidth]{linear_least_squares} @@ -123,41 +119,40 @@ minimizes the costs. \label{leastsquareerrorfig} \end{figure} -Replacing $y^{est}$ with our model, the straight line -\eqref{straightline}, yields +Replacing $y^{est}$ in the mean squared error \eqref{meansquarederror} +with our model, the straight line \eqref{straightline}, the cost +function reads \begin{eqnarray} - f_{cost}(\{(x_i, y_i)\}|m,b) & = & \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;m,b))^2 \label{msefunc} \\ + f_{cost}(m,b|\{(x_i, y_i)\}) & = & \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;m,b))^2 \label{msefunc} \\ & = & \frac{1}{N} \sum_{i=1}^N (y_i - m x_i - b)^2 \label{mseline} \end{eqnarray} -That is, the mean square error is given by the pairs $(x_i, y_i)$ of -measurements and the parameters $m$ and $b$ of the straight line. The -optimization process tries to find $m$ and $b$ such that the cost -function is minimized. With the mean squared error as the cost -function this optimization process is also called method of the -\enterm{least square error} (\determ[quadratischer +The optimization process tries to find the slope $m$ and the intercept +$b$ such that the cost function is minimized. With the mean squared +error as the cost function, this optimization process is also called the +method of the \enterm{least square error} (\determ[quadratischer Fehler!kleinster]{Methode der kleinsten Quadrate}). \begin{exercise}{meanSquaredError.m}{} - Implement the objective function \varcode{meanSquaredError()} that - uses a straight line, \eqnref{straightline}, as a model. The - function takes three arguments. The first is a 2-element vector that - contains the values of parameters \varcode{m} and \varcode{b}. The - second is a vector of x-values, and the third contains the - measurements for each value of $x$, the respective $y$-values. The - function returns the mean square error \eqnref{mseline}. + Implement the objective function \eqref{mseline} as a function + \varcode{meanSquaredError()}. The function takes three + arguments. The first is a vector of $x$-values and the second + contains the measurements $y$ for each value of $x$. The third + argument is a 2-element vector that contains the values of the + parameters \varcode{m} and \varcode{b}. The function returns the + mean square error. \end{exercise} \section{Error surface} For each combination of the two parameters $m$ and $b$ of the model we can use \eqnref{mseline} to calculate the corresponding value of the -cost function. We thus consider the cost function $f_{cost}(\{(x_i, -y_i)\}|m,b)$ as a function $f_{cost}(m,b)$, that maps the parameter -values $m$ and $b$ to an error value. The error values describe a -landscape over the $m$-$b$ plane, the error surface, that can be -illustrated graphically using a 3-d surface-plot. $m$ and $b$ are -plotted on the $x-$ and $y-$ axis while the third dimension indicates -the error value (\figref{errorsurfacefig}). +cost function. The cost function $f_{cost}(m,b|\{(x_i, y_i)\})$ is a +function $f_{cost}(m,b)$ that maps the parameter values $m$ and $b$ +to a scalar error value.
The error values describe a landscape over the +$m$-$b$ plane, the error surface, that can be illustrated graphically +using a 3-d surface-plot. $m$ and $b$ are plotted on the $x$- and $y$- +axis while the third dimension indicates the error value +(\figref{errorsurfacefig}). \begin{figure}[t] \includegraphics[width=0.75\textwidth]{error_surface} @@ -176,8 +171,8 @@ the error value (\figref{errorsurfacefig}). calculate the mean squared error between the data and straight lines for a range of slopes and intercepts using the \varcode{meanSquaredError()} function from the previous exercise. - Illustrates the error surface using the \code{surface()} function - (consult the help to find out how to use \code{surface()}). + Illustrate the error surface using the \code{surface()} function. + Consult the documentation to find out how to use \code{surface()}. \end{exercise} By looking at the error surface we can directly see the position of the minimum and thus estimate the optimal parameter combination. How can we use the error surface to guide an automatic optimization process? -The obvious approach would be to calculate the error surface and then -find the position of the minimum using the \code{min} function. This -approach, however has several disadvantages: (i) it is computationally -very expensive to calculate the error for each parameter -combination. The number of combinations increases exponentially with -the number of free parameters (also known as the ``curse of -dimensionality''). (ii) the accuracy with which the best parameters -can be estimated is limited by the resolution used to sample the -parameter space. The coarser the parameters are sampled the less -precise is the obtained position of the minimum. - -We want a procedure that finds the minimum of the cost function with a minimal number -of computations and to arbitrary precision. +The obvious approach would be to calculate the error surface for any +combination of slope and intercept values and then find the position +of the minimum using the \code{min} function. This approach, however, +has several disadvantages: (i) it is computationally very expensive to +calculate the error for each parameter combination. The number of +combinations increases exponentially with the number of free +parameters (also known as the ``curse of dimensionality''). (ii) the +accuracy with which the best parameters can be estimated is limited by +the resolution used to sample the parameter space. The coarser the +parameters are sampled, the less precise is the obtained position of +the minimum. + +So we need a different approach. We want a procedure that finds the +minimum of the cost function with a minimal number of computations and +to arbitrary precision. \begin{ibox}[t]{\label{differentialquotientbox}Difference quotient and derivative} \includegraphics[width=0.33\textwidth]{derivative} @@ -308,9 +305,9 @@ choose the opposite direction. \begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}% Implement a function \varcode{meanSquaredGradient()}, that takes the - set of parameters $(m, b)$ of a straight line as a two-element - vector and the $x$- and $y$-data as input arguments. The function - should return the gradient at that position as a vector with two + $x$- and $y$-data and the set of parameters $(m, b)$ of a straight + line, given as a two-element vector, as input arguments. The function should + return the gradient at the position $(m, b)$ as a vector with two elements.
\end{exercise} @@ -359,7 +356,7 @@ distance between the red dots in \figref{gradientdescentfig}) is large. \begin{figure}[t] - \includegraphics[width=0.55\textwidth]{gradient_descent} + \includegraphics[width=0.45\textwidth]{gradient_descent} \titlecaption{Gradient descent.}{The algorithm starts at an arbitrary position. At each point the gradient is estimated and the position is updated as long as the length of the gradient is @@ -376,7 +373,7 @@ large. \item Plot the error values as a function of the iterations, the number of optimization steps. \item Plot the measured data together with the best fitting straight line. - \end{enumerate} + \end{enumerate}\vspace{-4.5ex} \end{exercise}
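
The finite-difference gradient introduced in meanSquaredGradient.m can be sanity-checked against the closed-form partial derivatives of the straight-line mean squared error, $\partial f_{cost}/\partial m = -\frac{2}{N}\sum_{i=1}^N (y_i - m x_i - b) x_i$ and $\partial f_{cost}/\partial b = -\frac{2}{N}\sum_{i=1}^N (y_i - m x_i - b)$. The MATLAB sketch below is illustrative only: the synthetic data and the trial parameter values are assumptions, and it presumes that the patched meanSquaredError.m and meanSquaredGradient.m with the (x, y, parameter) argument order are on the path.

% Sketch: compare the finite-difference gradient of the mean squared
% error with its analytic partial derivatives for a straight line.
% Data and trial parameters are assumed values for illustration.
x = 10.0 * rand(40, 1);                 % synthetic input values
y = 2.0 * x - 5.0 + randn(40, 1);       % noisy line with slope 2 and intercept -5
parameter = [1.5, -2.0];                % some trial slope and intercept

% analytic gradient of f_cost(m,b) = mean((y - m*x - b).^2)
residuals = y - parameter(1) * x - parameter(2);
grad_analytic = [-2.0 * mean(residuals .* x), -2.0 * mean(residuals)];

% finite-difference gradient from the patched function
grad_numeric = meanSquaredGradient(x, y, parameter);
fprintf('numeric gradient : %8.4f %8.4f\n', grad_numeric(1), grad_numeric(2));
fprintf('analytic gradient: %8.4f %8.4f\n', grad_analytic(1), grad_analytic(2));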
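
A minimal end-to-end usage sketch, assuming the patched functions are on the MATLAB path: it generates synthetic data, runs the descent loop in the style of gradientDescent.m above, and produces the two plots asked for in the final exercise. The data values, the starting position, and the step size are assumptions chosen for illustration, not values from the chapter.

% Sketch: end-to-end run with the new (x, y, parameter) argument order.
% All data values, the starting position, and the step size are
% assumptions for illustration.
x = 10.0 * rand(100, 1);                % synthetic input values
y = 2.0 * x - 5.0 + randn(100, 1);      % noisy line with slope 2 and intercept -5

position = [0.0, 0.0];                  % arbitrary starting point (m, b)
eps = 0.01;                             % step size scaling the gradient
gradient = [];
errors = [];
count = 1;
while isempty(gradient) || norm(gradient) > 0.1
    gradient = meanSquaredGradient(x, y, position);
    errors(count) = meanSquaredError(x, y, position);
    position = position - eps .* gradient;
    count = count + 1;
end
fprintf('best fit: m = %.2f, b = %.2f\n', position(1), position(2));

subplot(1, 2, 1);                       % error as a function of iteration
plot(errors, '-o');
xlabel('iteration');
ylabel('mean squared error');
subplot(1, 2, 2);                       % data together with the fitted line
plot(x, y, '.');
hold on;
plot(x, position(1) * x + position(2), 'r', 'LineWidth', 2);
hold off;
xlabel('x');
ylabel('y');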