[regression] improved the chapter

Jan Benda 2019-12-10 22:29:35 +01:00
parent 060be5578b
commit 3d600e6ab7
9 changed files with 251 additions and 241 deletions

View File

@ -1,26 +1,33 @@
% x and y from exercise 8.3:
slopes = -5:0.25:5;
intercepts = -30:1:30;

% mean squared error and its gradient for each parameter combination:
error_surface = zeros(length(slopes), length(intercepts));
gradient_m = zeros(size(error_surface));
gradient_b = zeros(size(error_surface));
for i = 1:length(slopes)
    for j = 1:length(intercepts)
        error_surface(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y);
        grad = meanSquaredGradient([slopes(i), intercepts(j)], x, y);
        gradient_m(i,j) = grad(1);
        gradient_b(i,j) = grad(2);
    end
end

% plot the error surface as contour lines and the gradients as arrows:
figure()
hold on
[N, M] = meshgrid(intercepts, slopes);
%surface(M, N, error_surface, 'FaceAlpha', 0.5);
contour(M, N, error_surface, 50);
quiver(M, N, gradient_m, gradient_b)
xlabel('Slope m')
ylabel('Intercept b')
zlabel('Mean squared error')

View File

@ -1,19 +1,24 @@
% generate data:
m = 0.75;
b = -40.0;
n = 20;
x = 120.0*rand(n, 1);
y = m*x + b + 15.0*randn(n, 1);

% compute mean squared error for a range of slopes and intercepts:
slopes = -5:0.25:5;
intercepts = -30:1:30;
error_surface = zeros(length(slopes), length(intercepts));
for i = 1:length(slopes)
    for j = 1:length(intercepts)
        error_surface(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y);
    end
end

% plot the error surface:
figure()
[N, M] = meshgrid(intercepts, slopes);
surface(M, N, error_surface);
xlabel('slope', 'rotation', 7.5)
ylabel('intercept', 'rotation', -22.5)
zlabel('error')

View File

@ -1,26 +1,27 @@
% x, y from exercise 8.3

% some arbitrary values for the slope and the intercept to start with:
position = [-2. 10.];

% gradient descent:
gradient = [];
errors = [];
count = 1;
eps = 0.01;
while isempty(gradient) || norm(gradient) > 0.1
    gradient = meanSquaredGradient(position, x, y);
    errors(count) = meanSquaredError(position, x, y);
    position = position - eps .* gradient;
    count = count + 1;
end

% plot the data together with the best fitting straight line:
figure()
subplot(2,1,1)
hold on
scatter(x, y, 'displayname', 'data')
xx = min(x):0.01:max(x);
yy = position(1).*xx + position(2);
plot(xx, yy, 'displayname', 'fit')
xlabel('Input')
ylabel('Output')
grid on

View File

@ -1,13 +0,0 @@
function error = lsqError(parameter, x, y)
% Objective function for fitting a linear equation to data.
%
% Arguments: parameter, vector containing slope and intercept
% as the 1st and 2nd element
% x, vector of the input values
% y, vector of the corresponding measured output values
%
% Returns: the estimation error in terms of the mean square error
y_est = x .* parameter(1) + parameter(2);
error = meanSquareError(y, y_est);
end

View File

@ -1,10 +0,0 @@
function error = meanSquareError(y, y_est)
% Mean squared error between observed and predicted values.
%
% Arguments: y, vector of observed values.
% y_est, vector of predicted values.
%
% Returns: the error in the mean-squared-deviation sense.
error = mean((y - y_est).^2);
end

View File

@ -0,0 +1,12 @@
function mse = meanSquaredError(parameter, x, y)
% Mean squared error between a straight line and data pairs.
%
% Arguments: parameter, vector containing slope and intercept
% as the 1st and 2nd element, respectively.
% x, vector of the input values
% y, vector of the corresponding measured output values
%
% Returns: mse, the mean-squared-error.
mse = mean((y - x * parameter(1) - parameter(2)).^2);
end

View File

@ -0,0 +1 @@
mse = mean((y - y_est).^2);

View File

@ -1,5 +1,5 @@
function gradient = meanSquaredGradient(parameter, x, y)
% The gradient of the mean squared error
%
% Arguments: parameter, vector containing slope and intercept
%            as the 1st and 2nd element
@ -7,8 +7,10 @@ function gradient = meanSquaredGradient(parameter, x, y)
%            y, vector of the corresponding measured output values
%
% Returns: the gradient as a vector with two elements

h = 1e-6; % stepsize for derivatives
mse = meanSquaredError(parameter, x, y);
partial_m = (meanSquaredError([parameter(1)+h, parameter(2)], x, y) - mse)/h;
partial_n = (meanSquaredError([parameter(1), parameter(2)+h], x, y) - mse)/h;
gradient = [partial_m, partial_n];
end

View File

@ -18,6 +18,8 @@ the response values is minimized. One basic numerical method used for
such optimization problems is the so-called gradient descent, which is
introduced in this chapter.

%%% Another simple verbal example? Perhaps from population ecology?

\begin{figure}[t]
  \includegraphics[width=1\textwidth]{lin_regress}\hfill
  \titlecaption{Example data suggesting a linear relation.}{A set of
@ -31,7 +33,10 @@ introduced in this chapter.
The data plotted in \figref{linregressiondatafig} suggest a linear
relation between input and output of the system. We thus assume that a
straight line
\begin{equation}
  \label{straightline}
  y = f(x; m, b) = m\cdot x + b
\end{equation}
is an appropriate model to describe the system. The line has two free
parameters, the slope $m$ and the $y$-intercept $b$. We need to find
values for the slope and the intercept that best describe the measured
@ -45,59 +50,68 @@ slope and intercept that best describes the system.
Before the optimization can be done we need to specify what exactly is
considered an optimal fit. In our example we search the parameter
combination that describes the relation of $x$ and $y$ best. What is
meant by this? Each input $x_i$ leads to a measured output $y_i$ and
for each $x_i$ there is a \emph{prediction} or \emph{estimation}
$y^{est}_i$ of the output value by the model. At each $x_i$ estimation
and measurement have a distance or error $y_i - y_i^{est}$. In our
example the estimation is given by the equation $y_i^{est} =
f(x_i;m,b)$. The best fitting model with parameters $m$ and $b$ is the
one that minimizes the distances between observation $y_i$ and
estimation $y_i^{est}$ (\figref{leastsquareerrorfig}).

As a first guess we could simply minimize the sum $\sum_{i=1}^N y_i -
y^{est}_i$. This approach, however, will not work since a minimal sum
can also be achieved if half of the measurements are above and the
other half below the predicted line. Positive and negative errors
would cancel out and sum up to values close to zero. A better
approach is to sum over the absolute values of the distances:
$\sum_{i=1}^N |y_i - y^{est}_i|$. This sum can only be small if all
deviations are indeed small, no matter if they are above or below the
predicted line. Instead of the sum we could also take the average
\begin{equation}
  \label{meanabserror}
  f_{dist}(\{(x_i, y_i)\}|\{y^{est}_i\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}_i|
\end{equation}
For reasons that are explained in
chapter~\ref{maximumlikelihoodchapter}, instead of the averaged
absolute errors, the \enterm[mean squared error]{mean squared error}
(\determ[quadratischer Fehler!mittlerer]{mittlerer quadratischer
Fehler})
\begin{equation}
  \label{meansquarederror}
  f_{mse}(\{(x_i, y_i)\}|\{y^{est}_i\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}_i)^2
\end{equation}
is commonly used (\figref{leastsquareerrorfig}). Similar to the
absolute distance, the square of the errors, $(y_i - y_i^{est})^2$, is
always positive and thus positive and negative error values do not
cancel each other out. In addition, the square punishes large
deviations more strongly than small ones.

\begin{exercise}{meanSquaredErrorLine.m}{}\label{mseexercise}%
  Given a vector of observations \varcode{y} and a vector with the
  corresponding predictions \varcode{y\_est}, compute the \emph{mean
    squared error} between \varcode{y} and \varcode{y\_est} in a single
  line of code.
\end{exercise}
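A one-liner along these lines does the job (a minimal sketch, assuming
\varcode{y} and \varcode{y\_est} are vectors of the same length):
\begin{lstlisting}
% squared differences, averaged over all data points:
mse = mean((y - y_est).^2);
\end{lstlisting}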

\section{Objective function}

The mean squared error is a so-called \enterm{objective function} or
\enterm{cost function} (\determ{Kostenfunktion}), $f_{cost}(\{(x_i,
y_i)\}|\{y^{est}_i\})$. A cost function assigns to a given data set
$\{(x_i, y_i)\}$ and corresponding model predictions $\{y^{est}_i\}$ a
single scalar value that we want to minimize. Here we aim to adapt the
model parameters to minimize the mean squared error
\eqnref{meansquarederror}. In chapter~\ref{maximumlikelihoodchapter} we
show that the minimization of the mean squared error is equivalent to
maximizing the likelihood that the observations originate from the
model (assuming a normal distribution of the data around the model
prediction). The \enterm{cost function} does not have to be the mean
squared error but can be any function that maps the data and the
predictions to a scalar value describing the quality of the fit. In
the optimization process we aim for the parameter combination that
minimizes the costs.
\begin{figure}[t]
  \includegraphics[width=1\textwidth]{linear_least_squares}
@ -109,58 +123,44 @@ prediction).
  \label{leastsquareerrorfig}
\end{figure}

Replacing $y^{est}$ with our model, the straight line
\eqnref{straightline}, yields
\begin{eqnarray}
  f_{cost}(\{(x_i, y_i)\}|m,b) & = & \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;m,b))^2 \label{msefunc} \\
  & = & \frac{1}{N} \sum_{i=1}^N (y_i - m x_i - b)^2 \label{mseline}
\end{eqnarray}
That is, the mean squared error is given by the pairs $(x_i, y_i)$ of
measurements and the parameters $m$ and $b$ of the straight line. The
optimization process tries to find $m$ and $b$ such that the cost
function is minimized. With the mean squared error as the cost
function this optimization process is also called the method of the
\enterm{least square error} (\determ[quadratischer
Fehler!kleinster]{Methode der kleinsten Quadrate}).

\begin{exercise}{meanSquaredError.m}{}
  Implement the objective function \varcode{meanSquaredError()} that
  uses a straight line, \eqnref{straightline}, as a model. The
  function takes three arguments. The first is a 2-element vector that
  contains the values of the parameters \varcode{m} and \varcode{b}. The
  second is a vector of $x$-values, and the third contains the
  measurements for each value of $x$, the respective $y$-values. The
  function returns the mean squared error \eqnref{mseline}.
\end{exercise}
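Once implemented, the function can be probed with arbitrary parameter
values, for example (a quick sketch, assuming the data vectors
\varcode{x} and \varcode{y} are already in the workspace):
\begin{lstlisting}
% mean squared error of a line with slope 2 and intercept 1:
mse = meanSquaredError([2.0, 1.0], x, y);
\end{lstlisting}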

\section{Error surface}

For each combination of the two parameters $m$ and $b$ of the model we
can use \eqnref{mseline} to calculate the corresponding value of the
cost function. We thus consider the cost function $f_{cost}(\{(x_i,
y_i)\}|m,b)$ as a function $f_{cost}(m,b)$, that maps the parameter
values $m$ and $b$ to an error value. The error values describe a
landscape over the $m$-$b$ plane, the error surface, that can be
illustrated graphically using a 3-d surface-plot. $m$ and $b$ are
plotted on the $x$- and $y$-axis while the third dimension indicates
the error value (\figref{errorsurfacefig}).
\begin{figure}[t]
  \includegraphics[width=0.75\textwidth]{error_surface}
  \titlecaption{Error surface.}{The two model parameters $m$ and $b$
    define the base area of the surface plot. For each parameter
    combination of slope and intercept the error is calculated. The
@ -168,32 +168,36 @@ third dimension is used to indicate the error value
    combination that best fits the data.}\label{errorsurfacefig}
\end{figure}
\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
  Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
  slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
  drawing $x$ values between 0 and 120 and \varcode{randn()} for
  jittering the $y$ values with a standard deviation of 15. Then
  calculate the mean squared error between the data and straight lines
  for a range of slopes and intercepts using the
  \varcode{meanSquaredError()} function from the previous exercise.
  Illustrate the error surface using the \code{surface()} function
  (consult the help to find out how to use \code{surface()}).
\end{exercise}

By looking at the error surface we can directly see the position of
the minimum and thus estimate the optimal parameter combination. How
can we use the error surface to guide an automatic optimization
process?

The obvious approach would be to calculate the error surface and then
find the position of the minimum using the \code{min()} function. This
approach, however, has several disadvantages: (i) it is computationally
very expensive to calculate the error for each parameter
combination. The number of combinations increases exponentially with
the number of free parameters (also known as the ``curse of
dimensionality''). (ii) the accuracy with which the best parameters
can be estimated is limited by the resolution used to sample the
parameter space. The coarser the parameters are sampled, the less
precise is the obtained position of the minimum.
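As a side note, on an already sampled error surface the minimum can be
located along these lines (a sketch, assuming the variables
\varcode{error\_surface}, \varcode{slopes}, and \varcode{intercepts}
from exercise~\ref{errorsurfaceexercise}):
\begin{lstlisting}
[minerror, idx] = min(error_surface(:));     % smallest error on the sampled grid
[i, j] = ind2sub(size(error_surface), idx);  % row and column of the minimum
bestslope = slopes(i);
bestintercept = intercepts(j);
\end{lstlisting}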

We want a procedure that finds the minimum of the cost function with a
minimal number of computations and to arbitrary precision.
\begin{ibox}[t]{\label{differentialquotientbox}Difference quotient and derivative}
  \includegraphics[width=0.33\textwidth]{derivative}
@ -217,13 +221,13 @@ of computations.
    f'(x) = \frac{{\rm d} f(x)}{{\rm d}x} = \lim\limits_{\Delta x \to 0} \frac{f(x + \Delta x) - f(x)}{\Delta x}
    \end{equation}
  \end{minipage}\vspace{2ex}

  It is not possible to calculate the derivative, \eqnref{derivative},
  numerically. The derivative can only be estimated using the
  difference quotient, \eqnref{difffrac}, by using sufficiently small
  $\Delta x$.
\end{ibox}
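Numerically, the difference quotient boils down to a single line of
code. A minimal sketch (the function \varcode{f}, the position
\varcode{x0}, and the step size \varcode{h} are arbitrary choices for
illustration):
\begin{lstlisting}
f = @(x) x.^2;                 % an example function
x0 = 3.0;                      % position at which to estimate f'(x)
h = 1e-6;                      % small, but not too small, step size
dfdx = (f(x0 + h) - f(x0))/h;  % difference quotient, approximately f'(3) = 6
\end{lstlisting}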
\begin{ibox}[tp]{\label{partialderivativebox}Partial derivative and gradient}
  Some functions depend on more than a single variable. The function
  \[ z = f(x,y) \]
  for example depends on $x$ and $y$. Using the partial derivative
@ -234,33 +238,33 @@ of computations.
  individually by using the respective difference quotient
  (Box~\ref{differentialquotientbox}). \vspace{1ex}

  \begin{minipage}[t]{0.5\textwidth}
    \mbox{}\\[-2ex]
    \includegraphics[width=1\textwidth]{gradient}
  \end{minipage}
  \hfill
  \begin{minipage}[t]{0.46\textwidth}
    For example, the partial derivatives of
    \[ f(x,y) = x^2+y^2 \] are
    \[ \frac{\partial f(x,y)}{\partial x} = 2x \; , \quad \frac{\partial f(x,y)}{\partial y} = 2y \; .\]
    The gradient is a vector that is constructed from the partial derivatives:
    \[ \nabla f(x,y) = \left( \begin{array}{c} \frac{\partial f(x,y)}{\partial x} \\[1ex] \frac{\partial f(x,y)}{\partial y} \end{array} \right) \]
    This vector points into the direction of the strongest ascent of
    $f(x,y)$.
  \end{minipage}

  \vspace{0.5ex} The figure shows the contour lines of a bi-variate
  Gaussian $f(x,y) = \exp(-(x^2+y^2)/2)$ and the gradient (thick
  arrows) and the corresponding two partial derivatives (thin arrows)
  for three different locations.
\end{ibox}
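The gradient of the example function can be approximated numerically
in the same way, one partial derivative at a time (a sketch with an
arbitrarily chosen position and step size):
\begin{lstlisting}
f = @(x, y) x.^2 + y.^2;                 % the example function from the box
x0 = 1.0; y0 = 2.0;                      % position of interest
h = 1e-6;                                % small step size
dfdx = (f(x0 + h, y0) - f(x0, y0))/h;    % approximately 2*x0 = 2
dfdy = (f(x0, y0 + h) - f(x0, y0))/h;    % approximately 2*y0 = 4
gradf = [dfdx, dfdy];                    % numerical estimate of the gradient
\end{lstlisting}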

\section{Gradient}

Imagine placing a small ball at some point on the error surface
(\figref{errorsurfacefig}). Naturally, it would roll down the steepest
slope and eventually stop at the minimum of the error surface (if it
had no inertia). We will use this picture to develop an algorithm to
find our way to the minimum of the objective function. The ball will
always follow the steepest slope. Thus we need to figure out the direction of
@ -268,33 +272,31 @@ the steepest slope at the position of the ball.
The \entermde{Gradient}{gradient} (Box~\ref{partialderivativebox}) of the
objective function is the vector
\begin{equation}
  \label{gradient}
  \nabla f_{cost}(m,b) = \left( \frac{\partial f(m,b)}{\partial m},
  \frac{\partial f(m,b)}{\partial b} \right)
\end{equation}
that points into the direction of the strongest ascent of the
objective function. The gradient is given by the partial derivatives
(Box~\ref{partialderivativebox}) of the mean squared error with
respect to the parameters $m$ and $b$ of the straight line. There is
no need to calculate it analytically because it can be estimated from
the partial derivatives using the difference quotient
(Box~\ref{differentialquotientbox}) for small steps $\Delta m$ and
$\Delta b$. For example, the partial derivative with respect to $m$
can be computed as
\[\frac{\partial f_{cost}(m,b)}{\partial m} = \lim\limits_{\Delta m \to
  0} \frac{f_{cost}(m + \Delta m, b) - f_{cost}(m,b)}{\Delta m}
  \approx \frac{f_{cost}(m + \Delta m, b) - f_{cost}(m,b)}{\Delta m} \; . \]
The length of the gradient indicates the steepness of the slope
(\figref{gradientquiverfig}). Since we want to go down the hill, we
choose the opposite direction.

\begin{figure}[t]
  \includegraphics[width=0.75\textwidth]{error_gradient}
  \titlecaption{Gradient of the error surface.}{Each arrow points
    into the direction of the greatest ascent at different positions
    of the error surface shown in \figref{errorsurfacefig}. The
@ -304,60 +306,60 @@ choose the opposite direction.
    error.}\label{gradientquiverfig}
\end{figure}
\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
  Implement a function \varcode{meanSquaredGradient()}, that takes the
  set of parameters $(m, b)$ of a straight line as a two-element
  vector and the $x$- and $y$-data as input arguments. The function
  should return the gradient at that position as a vector with two
  elements.
\end{exercise}
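Such an implementation can be quickly sanity-checked by evaluating the
gradient at a single, arbitrarily chosen position and inspecting its
length (a sketch; it assumes the data vectors \varcode{x} and
\varcode{y} and the \varcode{meanSquaredGradient()} function are
available):
\begin{lstlisting}
grad = meanSquaredGradient([-2.0, 10.0], x, y);  % gradient at m = -2, b = 10
steepness = norm(grad);                          % length of the gradient
\end{lstlisting}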
\begin{exercise}{errorGradient.m}{}
  Extend the script of exercise~\ref{errorsurfaceexercise} to plot
  both the error surface and gradients using the
  \varcode{meanSquaredGradient()} function from
  exercise~\ref{gradientexercise}. Vectors in space can be easily
  plotted using the function \code{quiver()}. Use \code{contour()}
  instead of \code{surface()} to plot the error surface.
\end{exercise}

\section{Gradient descent}

Finally, we are able to implement the optimization itself. By now it
should be obvious why it is called the gradient descent method. All
ingredients are already there. We need: (i) the cost function
(\varcode{meanSquaredError()}), and (ii) the gradient
(\varcode{meanSquaredGradient()}). The algorithm of the gradient
descent works as follows:
\begin{enumerate}
\item Start with some given combination of the parameters $m$ and $b$
  ($p_0 = (m_0, b_0)$).
\item \label{computegradient} Calculate the gradient at the current
  position $p_i$.
\item If the length of the gradient falls below a certain value, we
  assume to have reached the minimum and stop the search. We are
  actually looking for the point at which the length of the gradient
  is zero, but finding zero is impossible because of numerical
  imprecision. We thus apply a threshold below which we are
  sufficiently close to zero (e.g. \varcode{norm(gradient) < 0.1}).
\item \label{gradientstep} If the length of the gradient exceeds the
  threshold we take a small step into the opposite direction:
  \[p_{i+1} = p_i - \epsilon \cdot \nabla f_{cost}(m_i, b_i)\]
  where $\epsilon = 0.01$ is a factor linking the gradient to
  appropriate steps in the parameter space.
\item Repeat steps \ref{computegradient} -- \ref{gradientstep}.
\end{enumerate}
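In code, this loop takes only a few lines. A minimal sketch (assuming
the \varcode{meanSquaredError()} and \varcode{meanSquaredGradient()}
functions from the previous exercises and the data vectors
\varcode{x} and \varcode{y}):
\begin{lstlisting}
position = [-2.0, 10.0];   % arbitrary start values for slope and intercept
eps = 0.01;                % factor linking gradient to parameter steps
gradient = [];
while isempty(gradient) || norm(gradient) > 0.1
    gradient = meanSquaredGradient(position, x, y);
    position = position - eps .* gradient;
end
\end{lstlisting}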

\Figref{gradientdescentfig} illustrates the gradient descent, that is,
the path the imaginary ball has chosen to reach the minimum. Starting
at an arbitrary position on the error surface we change the position
as long as the gradient at that position is larger than a certain
threshold. If the slope is very steep, the change in the position (the
distance between the red dots in \figref{gradientdescentfig}) is
large.

\begin{figure}[t]
  \includegraphics[width=0.55\textwidth]{gradient_descent}
  \titlecaption{Gradient descent.}{The algorithm starts at an
    arbitrary position. At each point the gradient is estimated and
    the position is updated as long as the length of the gradient is
@ -366,53 +368,56 @@ large.
\end{figure}
\begin{exercise}{gradientDescent.m}{}
  Implement the gradient descent for the problem of fitting a straight
  line to some measured data. Reuse the data generated in
  exercise~\ref{errorsurfaceexercise}.
  \begin{enumerate}
  \item Store the error value for each iteration.
  \item Plot the error values as a function of the iterations, the
    number of optimization steps.
  \item Plot the measured data together with the best fitting straight line.
  \end{enumerate}
\end{exercise}

\section{Summary}

The gradient descent is an important numerical method for solving
optimization problems. It is used to find the global minimum of an
objective function.

Curve fitting is a common application for the gradient descent method.
For the case of fitting straight lines to data pairs, the error
surface (using the mean squared error) has exactly one clearly defined
global minimum. In fact, the position of the minimum can be analytically
calculated as shown in the next chapter.

Problems that involve nonlinear computations on parameters, e.g. the
rate $\lambda$ in an exponential function $f(x;\lambda) = e^{\lambda
x}$, do not have an analytical solution for the least squares. To
find the least-squares fit for such functions, numerical methods such
as the gradient descent have to be applied.

The suggested gradient descent algorithm can be improved in multiple
ways to converge faster. For example one could adapt the step size to
the length of the gradient. These numerical tricks have already been
implemented in pre-defined functions. Generic optimization functions
such as \matlabfun{fminsearch()} have been implemented for arbitrary
objective functions, while the more specialized function
\matlabfun{lsqcurvefit()} is specifically designed for optimizations in
the least square error sense.
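For the straight-line example, handing the cost function over to
\matlabfun{fminsearch()} could look like this (a sketch, reusing the
\varcode{meanSquaredError()} function and the data \varcode{x},
\varcode{y} from the exercises):
\begin{lstlisting}
pstart = [-2.0, 10.0];   % initial values for slope m and intercept b
pbest = fminsearch(@(p) meanSquaredError(p, x, y), pstart);
\end{lstlisting}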
%\newpage
\begin{important}[Beware of secondary minima!]
  Finding the absolute minimum is not always as easy as in the case of
  fitting a straight line. Often, the error surface has secondary or
  local minima in which the gradient descent stops even though there
  is a more optimal solution, a global minimum that is lower than the
  local minimum. Starting from good initial positions is a good
  approach to avoid getting stuck in local minima. Also keep in mind
  that error surfaces tend to be simpler (fewer local minima) the fewer
  parameters are fitted from the data. Each additional parameter
  increases complexity and is computationally more expensive.
\end{important}