[regression] further improved the chapter

Jan Benda 2019-12-11 09:48:19 +01:00
parent 9cc839de94
commit 4273a953f6
7 changed files with 92 additions and 108 deletions

View File

@@ -1,22 +1,13 @@
 % x, y, slopes, and intercepts from exercise 8.3
-slopes = -5:0.25:5;
-intercepts = -30:1:30;
-error_surface = zeros(length(slopes), length(intercepts));
-for i = 1:length(slopes)
-    for j = 1:length(intercepts)
-        error_surf(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y);
-    end
-end
 error_surface = zeros(length(slopes), length(intercepts));
 gradient_m = zeros(size(error_surface));
 gradient_b = zeros(size(error_surface));
 for i = 1:length(slopes)
     for j = 1:length(intercepts)
-        error_surface(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y);
-        grad = meanSquaredGradient([slopes(i), intercepts(j)], x, y);
+        error_surface(i,j) = meanSquaredError(x, y, [slopes(i), intercepts(j)]);
+        grad = meanSquaredGradient(x, y, [slopes(i), intercepts(j)]);
         gradient_m(i,j) = grad(1);
         gradient_b(i,j) = grad(2);
     end
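Not part of the commit: a minimal sketch, assuming x and y from the previous exercise are still in the workspace, of how the arrays computed above might be visualized (error surface plus gradient field):

% hypothetical visualization of error_surface, gradient_m, and gradient_b
[I, M] = meshgrid(intercepts, slopes);  % I: intercepts along columns, M: slopes along rows
figure();
subplot(1, 2, 1);
surf(M, I, error_surface);              % error surface over the m-b plane
xlabel('slope m');
ylabel('intercept b');
zlabel('mean squared error');
subplot(1, 2, 2);
quiver(M, I, gradient_m, gradient_b);   % the gradient points uphill on that surface
xlabel('slope m');
ylabel('intercept b');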

View File

@@ -11,7 +11,7 @@ intercepts = -30:1:30;
 error_surface = zeros(length(slopes), length(intercepts));
 for i = 1:length(slopes)
     for j = 1:length(intercepts)
-        error_surf(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y);
+        error_surf(i,j) = meanSquaredError(x, y, [slopes(i), intercepts(j)]);
     end
 end
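Side note, not part of the diff: the loop body writes to error_surf although error_surface is the array that was preallocated. A consistent version of this loop would presumably read:

% hypothetical consistent version, filling the preallocated array
error_surface = zeros(length(slopes), length(intercepts));
for i = 1:length(slopes)
    for j = 1:length(intercepts)
        error_surface(i,j) = meanSquaredError(x, y, [slopes(i), intercepts(j)]);
    end
end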

View File

@@ -9,8 +9,8 @@ errors = [];
 count = 1;
 eps = 0.01;
 while isempty(gradient) || norm(gradient) > 0.1
-    gradient = meanSquaredGradient(position, x, y);
-    errors(count) = meanSquaredError(position, x, y);
+    gradient = meanSquaredGradient(x, y, position);
+    errors(count) = meanSquaredError(x, y, position);
     position = position - eps .* gradient;
     count = count + 1;
 end
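For context, not part of the commit: the loop presumably follows initializations of position, gradient, and errors earlier in the script. A self-contained sketch with assumed start values, where x and y are the data from the previous exercises:

position = [-2.0, 10.0];  % assumed start values for slope and intercept
gradient = [];            % empty so that the loop body runs at least once
errors = [];              % collects the mean squared error of every step
count = 1;
eps = 0.01;               % assumed step size (shadows the built-in eps)
while isempty(gradient) || norm(gradient) > 0.1
    gradient = meanSquaredGradient(x, y, position);
    errors(count) = meanSquaredError(x, y, position);
    position = position - eps .* gradient;
    count = count + 1;
end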

View File

@@ -1,10 +1,10 @@
-function mse = meanSquaredError(parameter, x, y)
+function mse = meanSquaredError(x, y, parameter)
 % Mean squared error between a straight line and data pairs.
 %
-% Arguments: parameter, vector containing slope and intercept
-%            as the 1st and 2nd element, respectively.
-%            x, vector of the input values
+% Arguments: x, vector of the input values
 %            y, vector of the corresponding measured output values
+%            parameter, vector containing slope and intercept
+%            as the 1st and 2nd element, respectively.
 %
 % Returns: mse, the mean-squared-error.
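The function body is not shown in this hunk; consistent with the docstring and the new argument order, it presumably computes something along these lines (a sketch, not the repository's actual code):

function mse = meanSquaredError(x, y, parameter)
% Hypothetical body matching the documented interface.
    y_est = parameter(1) .* x + parameter(2);  % straight-line prediction for each x
    mse = mean((y - y_est).^2);                % average squared deviation
end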

View File

@@ -1,16 +1,16 @@
-function gradient = meanSquaredGradient(parameter, x, y)
+function gradient = meanSquaredGradient(x, y, parameter)
 % The gradient of the mean squared error
 %
-% Arguments: parameter, vector containing slope and intercept
-%            as the 1st and 2nd element
-%            x, vector of the input values
+% Arguments: x, vector of the input values
 %            y, vector of the corresponding measured output values
+%            parameter, vector containing slope and intercept
+%            as the 1st and 2nd element
 %
 % Returns: the gradient as a vector with two elements

 h = 1e-6; % stepsize for derivatives
-mse = meanSquaredError(parameter, x, y);
-partial_m = (meanSquaredError([parameter(1)+h, parameter(2)], x, y) - mse)/h;
-partial_n = (meanSquaredError([parameter(1), parameter(2)+h], x, y) - mse)/h;
+mse = meanSquaredError(x, y, parameter);
+partial_m = (meanSquaredError(x, y, [parameter(1)+h, parameter(2)]) - mse)/h;
+partial_n = (meanSquaredError(x, y, [parameter(1), parameter(2)+h]) - mse)/h;
 gradient = [partial_m, partial_n];
 end
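A usage sketch, not part of the commit: with data vectors x and y in the workspace, the numerical gradient can be checked against the analytical derivative of the mean squared error at an assumed position:

position = [2.0, -5.0];                       % assumed slope and intercept
gnum = meanSquaredGradient(x, y, position);   % numerical gradient from above
residuals = y - (position(1) .* x + position(2));
gana = [-2.0 * mean(x .* residuals), -2.0 * mean(residuals)];  % analytical gradient
disp(gnum - gana);                            % should be close to zero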

View File

@@ -16,9 +16,11 @@
 \include{regression}

-\section{Improvements}
-Adapt function arguments to matlabs polyfit. That is: first the data
-(x,y) and then the parameter vector. p(1) is slope, p(2) is intercept.
+\subsection{Linear fits}
+\begin{itemize}
+\item Polyfit is easy: unique solution!
+\item Example for overfitting with polyfit of a high order (=number of data points)
+\end{itemize}

 \section{Fitting in practice}
@@ -34,11 +36,5 @@ Fit with matlab functions lsqcurvefit, polyfit
 \item How to test the quality of a fit? Residuals. $\chi^2$ test. Run-test.
 \end{itemize}

-\subsection{Linear fits}
-\begin{itemize}
-\item Polyfit is easy: unique solution!
-\item Example for overfitting with polyfit of a high order (=number of data points)
-\end{itemize}
-
 \end{document}
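The newly added item on overfitting could eventually be backed by a small demo along these lines (a hypothetical sketch, not in the commit):

% hypothetical demo: a polynomial of order n-1 interpolates all n noisy points
n = 10;
x = linspace(0.0, 1.0, n);
y = 2.0 .* x + 0.2 .* randn(1, n);    % noisy data around a straight line
p1 = polyfit(x, y, 1);                % linear fit: captures the trend
pn = polyfit(x, y, n - 1);            % high-order fit: chases the noise
xx = linspace(0.0, 1.0, 200);
plot(x, y, 'o', xx, polyval(p1, xx), '-', xx, polyval(pn, xx), '--');
legend('data', 'order 1', 'order n-1');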

View File

@@ -52,40 +52,43 @@ considered an optimal fit. In our example we search the parameter
 combination that describes the relation of $x$ and $y$ best. What is
 meant by this? Each input $x_i$ leads to a measured output $y_i$ and
 for each $x_i$ there is a \emph{prediction} or \emph{estimation}
-$y^{est}_i$ of the output value by the model. At each $x_i$ estimation
-and measurement have a distance or error $y_i - y_i^{est}$. In our
-example the estimation is given by the equation $y_i^{est} =
-f(x;m,b)$. The best fitting model with parameters $m$ and $b$ is the
-one that minimizes the distances between observation $y_i$ and
-estimation $y_i^{est}$ (\figref{leastsquareerrorfig}).
+$y^{est}(x_i)$ of the output value by the model. At each $x_i$
+estimation and measurement have a distance or error $y_i -
+y^{est}(x_i)$. In our example the estimation is given by the equation
+$y^{est}(x_i) = f(x_i;m,b)$. The best fitting model with parameters
+$m$ and $b$ is the one that minimizes the distances between
+observation $y_i$ and estimation $y^{est}(x_i)$
+(\figref{leastsquareerrorfig}).

 As a first guess we could simply minimize the sum $\sum_{i=1}^N y_i -
-y^{est}_i$. This approach, however, will not work since a minimal sum
+y^{est}(x_i)$. This approach, however, will not work since a minimal sum
 can also be achieved if half of the measurements are above and the
 other half below the predicted line. Positive and negative errors
 would cancel out and then sum up to values close to zero. A better
 approach is to sum over the absolute values of the distances:
-$\sum_{i=1}^N |y_i - y^{est}_i|$. This sum can only be small if all
+$\sum_{i=1}^N |y_i - y^{est}(x_i)|$. This sum can only be small if all
 deviations are indeed small no matter if they are above or below the
 predicted line. Instead of the sum we could also take the average
 \begin{equation}
   \label{meanabserror}
-  f_{dist}(\{(x_i, y_i)\}|\{y^{est}_i\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}_i|
+  f_{dist}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}(x_i)|
 \end{equation}
-For reasons that are explained in
-chapter~\ref{maximumlikelihoodchapter}, instead of the averaged
-absolute errors, the \enterm[mean squared error]{mean squared error}
-(\determ[quadratischer Fehler!mittlerer]{mittlerer quadratischer
-Fehler})
+Instead of the averaged absolute errors, the \enterm[mean squared
+error]{mean squared error} (\determ[quadratischer
+Fehler!mittlerer]{mittlerer quadratischer Fehler})
 \begin{equation}
   \label{meansquarederror}
-  f_{mse}(\{(x_i, y_i)\}|\{y^{est}_i\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}_i)^2
+  f_{mse}(\{(x_i, y_i)\}|\{y^{est}(x_i)\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}(x_i))^2
 \end{equation}
 is commonly used (\figref{leastsquareerrorfig}). Similar to the
-absolute distance, the square of the errors, $(y_i - y_i^{est})^2$, is
+absolute distance, the square of the errors, $(y_i - y^{est}(x_i))^2$, is
 always positive and thus positive and negative error values do not
 cancel each other out. In addition, the square punishes large
-deviations over small deviations.
+deviations over small deviations. In
+chapter~\ref{maximumlikelihoodchapter} we show that minimizing the
+mean square error is equivalent to maximizing the likelihood that the
+observations originate from the model, if the data are normally
+distributed around the model prediction.

 \begin{exercise}{meanSquaredErrorLine.m}{}\label{mseexercise}%
   Given a vector of observations \varcode{y} and a vector with the
@@ -98,20 +101,13 @@ deviations over small deviations.

 \section{Objective function}
 The mean squared error is a so-called \enterm{objective function} or
-\enterm{cost function} (\determ{Kostenfunktion}), $f_{cost}(\{(x_i,
-y_i)\}|\{y^{est}_i\})$. A cost function assigns to the given data set
-$\{(x_i, y_i)\}$ and corresponding model predictions $\{y^{est}_i\}$ a
-single scalar value that we want to minimize. Here we aim to adapt the
-model parameters to minimize the mean squared error
-\eqref{meansquarederror}. In chapter~\ref{maximumlikelihoodchapter} we
-show that the minimization of the mean square error is equivalent to
-maximizing the likelihood that the observations originate from the
-model (assuming a normal distribution of the data around the model
-prediction). The \enterm{cost function} does not have to be the mean
-square error but can be any function that maps the data and the
-predictions to a scalar value describing the quality of the fit. In
-the optimization process we aim for the paramter combination that
-minimizes the costs.
+\enterm{cost function} (\determ{Kostenfunktion}). A cost function
+assigns to a model prediction $\{y^{est}(x_i)\}$ for a given data set
+$\{(x_i, y_i)\}$ a single scalar value that we want to minimize. Here
+we aim to adapt the model parameters to minimize the mean squared
+error \eqref{meansquarederror}. In general, the \enterm{cost function}
+can be any function that describes the quality of the fit by mapping
+the data and the predictions to a single scalar value.

 \begin{figure}[t]
   \includegraphics[width=1\textwidth]{linear_least_squares}
@@ -123,41 +119,40 @@ minimizes the costs.
   \label{leastsquareerrorfig}
 \end{figure}

-Replacing $y^{est}$ with our model, the straight line
-\eqref{straightline}, yields
+Replacing $y^{est}$ in the mean squared error \eqref{meansquarederror}
+with our model, the straight line \eqref{straightline}, the cost
+function reads
 \begin{eqnarray}
-  f_{cost}(\{(x_i, y_i)\}|m,b) & = & \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;m,b))^2 \label{msefunc} \\
+  f_{cost}(m,b|\{(x_i, y_i)\}) & = & \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;m,b))^2 \label{msefunc} \\
   & = & \frac{1}{N} \sum_{i=1}^N (y_i - m x_i - b)^2 \label{mseline}
 \end{eqnarray}
-That is, the mean square error is given by the pairs $(x_i, y_i)$ of
-measurements and the parameters $m$ and $b$ of the straight line. The
-optimization process tries to find $m$ and $b$ such that the cost
-function is minimized. With the mean squared error as the cost
-function this optimization process is also called method of the
-\enterm{least square error} (\determ[quadratischer
+The optimization process tries to find the slope $m$ and the intercept
+$b$ such that the cost function is minimized. With the mean squared
+error as the cost function this optimization process is also called
+the method of the \enterm{least square error} (\determ[quadratischer
 Fehler!kleinster]{Methode der kleinsten Quadrate}).

 \begin{exercise}{meanSquaredError.m}{}
-  Implement the objective function \varcode{meanSquaredError()} that
-  uses a straight line, \eqnref{straightline}, as a model. The
-  function takes three arguments. The first is a 2-element vector that
-  contains the values of parameters \varcode{m} and \varcode{b}. The
-  second is a vector of x-values, and the third contains the
-  measurements for each value of $x$, the respective $y$-values. The
-  function returns the mean square error \eqnref{mseline}.
+  Implement the objective function \eqref{mseline} as a function
+  \varcode{meanSquaredError()}. The function takes three
+  arguments. The first is a vector of $x$-values and the second
+  contains the measurements $y$ for each value of $x$. The third
+  argument is a 2-element vector that contains the values of
+  parameters \varcode{m} and \varcode{b}. The function returns the
+  mean square error.
 \end{exercise}

 \section{Error surface}

 For each combination of the two parameters $m$ and $b$ of the model we
 can use \eqnref{mseline} to calculate the corresponding value of the
-cost function. We thus consider the cost function $f_{cost}(\{(x_i,
-y_i)\}|m,b)$ as a function $f_{cost}(m,b)$, that maps the parameter
-values $m$ and $b$ to an error value. The error values describe a
-landscape over the $m$-$b$ plane, the error surface, that can be
-illustrated graphically using a 3-d surface-plot. $m$ and $b$ are
-plotted on the $x-$ and $y-$ axis while the third dimension indicates
-the error value (\figref{errorsurfacefig}).
+cost function. The cost function $f_{cost}(m,b|\{(x_i, y_i)\})$ is a
+function $f_{cost}(m,b)$ that maps the parameter values $m$ and $b$
+to a scalar error value. The error values describe a landscape over the
+$m$-$b$ plane, the error surface, that can be illustrated graphically
+using a 3-d surface-plot. $m$ and $b$ are plotted on the $x$- and $y$-
+axis while the third dimension indicates the error value
+(\figref{errorsurfacefig}).

 \begin{figure}[t]
   \includegraphics[width=0.75\textwidth]{error_surface}
@@ -176,8 +171,8 @@ the error value (\figref{errorsurfacefig}).
   calculate the mean squared error between the data and straight lines
   for a range of slopes and intercepts using the
   \varcode{meanSquaredError()} function from the previous exercise.
-  Illustrates the error surface using the \code{surface()} function
-  (consult the help to find out how to use \code{surface()}).
+  Illustrate the error surface using the \code{surface()} function.
+  Consult the documentation to find out how to use \code{surface()}.
 \end{exercise}

 By looking at the error surface we can directly see the position of
@@ -185,19 +180,21 @@ the minimum and thus estimate the optimal parameter combination. How
 can we use the error surface to guide an automatic optimization
 process?

-The obvious approach would be to calculate the error surface and then
-find the position of the minimum using the \code{min} function. This
-approach, however has several disadvantages: (i) it is computationally
-very expensive to calculate the error for each parameter
-combination. The number of combinations increases exponentially with
-the number of free parameters (also known as the ``curse of
-dimensionality''). (ii) the accuracy with which the best parameters
-can be estimated is limited by the resolution used to sample the
-parameter space. The coarser the parameters are sampled the less
-precise is the obtained position of the minimum.
+The obvious approach would be to calculate the error surface for any
+combination of slope and intercept values and then find the position
+of the minimum using the \code{min} function. This approach, however,
+has several disadvantages: (i) it is computationally very expensive to
+calculate the error for each parameter combination. The number of
+combinations increases exponentially with the number of free
+parameters (also known as the ``curse of dimensionality''). (ii) the
+accuracy with which the best parameters can be estimated is limited by
+the resolution used to sample the parameter space. The coarser the
+parameters are sampled the less precise is the obtained position of
+the minimum.

-We want a procedure that finds the minimum of the cost function with a minimal number
-of computations and to arbitrary precision.
+So we need a different approach. We want a procedure that finds the
+minimum of the cost function with a minimal number of computations and
+to arbitrary precision.

 \begin{ibox}[t]{\label{differentialquotientbox}Difference quotient and derivative}
   \includegraphics[width=0.33\textwidth]{derivative}
@@ -308,9 +305,9 @@ choose the opposite direction.

 \begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
   Implement a function \varcode{meanSquaredGradient()}, that takes the
-  set of parameters $(m, b)$ of a straight line as a two-element
-  vector and the $x$- and $y$-data as input arguments. The function
-  should return the gradient at that position as a vector with two
+  $x$- and $y$-data and the set of parameters $(m, b)$ of a straight
+  line as a two-element vector as input arguments. The function should
+  return the gradient at the position $(m, b)$ as a vector with two
   elements.
 \end{exercise}
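Not part of the commit: for checking the numerical gradient asked for in this exercise, the partial derivatives of \eqref{mseline} can also be written out analytically, for example as

\begin{eqnarray}
  \frac{\partial f_{cost}(m,b)}{\partial m} & = & -\frac{2}{N} \sum_{i=1}^N x_i \, (y_i - m x_i - b) \\
  \frac{\partial f_{cost}(m,b)}{\partial b} & = & -\frac{2}{N} \sum_{i=1}^N (y_i - m x_i - b)
\end{eqnarray}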
@@ -359,7 +356,7 @@ distance between the red dots in \figref{gradientdescentfig}) is
 large.

 \begin{figure}[t]
-  \includegraphics[width=0.55\textwidth]{gradient_descent}
+  \includegraphics[width=0.45\textwidth]{gradient_descent}
   \titlecaption{Gradient descent.}{The algorithm starts at an
     arbitrary position. At each point the gradient is estimated and
     the position is updated as long as the length of the gradient is
@@ -376,7 +373,7 @@ large.
   \item Plot the error values as a function of the iterations, the
     number of optimization steps.
   \item Plot the measured data together with the best fitting straight line.
-  \end{enumerate}
+  \end{enumerate}\vspace{-4.5ex}
 \end{exercise}
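A minimal sketch of the plots requested by these exercise items, assuming the descent loop from above has produced errors and position and that x and y hold the measured data (not part of the commit):

% hypothetical plots of the gradient descent results
subplot(1, 2, 1);
plot(errors, '-o');                              % error value per optimization step
xlabel('iteration');
ylabel('mean squared error');
subplot(1, 2, 2);
plot(x, y, 'o');                                 % measured data
hold on;
plot(x, position(1) .* x + position(2), 'r-');   % best fitting straight line
hold off;
xlabel('x');
ylabel('y');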