diff --git a/regression/code/errorGradient.m b/regression/code/errorGradient.m index a24367f..b9bb642 100644 --- a/regression/code/errorGradient.m +++ b/regression/code/errorGradient.m @@ -1,26 +1,33 @@ -load('lin_regression.mat') -ms = -1:0.5:5; -ns = -10:1:10; +% x, y, slopes, and intercepts from exercise 8.3 -error_surf = zeros(length(ms), length(ns)); -gradient_m = zeros(size(error_surf)); -gradient_n = zeros(size(error_surf)); +slopes = -5:0.25:5; +intercepts = -30:1:30; +error_surface = zeros(length(slopes), length(intercepts)); +for i = 1:length(slopes) + for j = 1:length(intercepts) + error_surface(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y); + end +end + +error_surface = zeros(length(slopes), length(intercepts)); +gradient_m = zeros(size(error_surface)); +gradient_b = zeros(size(error_surface)); -for i = 1:length(ms) - for j = 1:length(ns) - error_surf(i,j) = lsqError([ms(i), ns(j)], x, y); - grad = lsqGradient([ms(i), ns(j)], x, y); +for i = 1:length(slopes) + for j = 1:length(intercepts) + error_surface(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y); + grad = meanSquaredGradient([slopes(i), intercepts(j)], x, y); gradient_m(i,j) = grad(1); - gradient_n(i,j) = grad(2); + gradient_b(i,j) = grad(2); end end figure() hold on -[N, M] = meshgrid(ns, ms); -%surface(M,N, error_surf, 'FaceAlpha', 0.5); -contour(M,N, error_surf, 50); -quiver(M,N, gradient_m, gradient_n) +[N, M] = meshgrid(intercepts, slopes); +%surface(M, N, error_surface, 'FaceAlpha', 0.5); +contour(M, N, error_surface, 50); +quiver(M, N, gradient_m, gradient_b) xlabel('Slope m') ylabel('Intercept b') zlabel('Mean squared error') diff --git a/regression/code/errorSurface.m b/regression/code/errorSurface.m index 0e88c1e..f86e3b1 100644 --- a/regression/code/errorSurface.m +++ b/regression/code/errorSurface.m @@ -1,19 +1,24 @@ -load('lin_regression.mat'); +% generate data: +m = 0.75; +b = -40.0; +n = 20; +x = 120.0*rand(n, 1); +y = m*x + b + 15.0*randn(n, 1); % compute mean squared error for a range of slopes and intercepts: slopes = -5:0.25:5; intercepts = -30:1:30; -error_surf = zeros(length(slopes), length(intercepts)); +error_surface = zeros(length(slopes), length(intercepts)); for i = 1:length(slopes) for j = 1:length(intercepts) - error_surf(i,j) = lsqError([slopes(i), intercepts(j)], x, y); + error_surface(i,j) = meanSquaredError([slopes(i), intercepts(j)], x, y); end end % plot the error surface: figure() [N,M] = meshgrid(intercepts, slopes); -s = surface(M, N, error_surf); +surface(M, N, error_surface); xlabel('slope', 'rotation', 7.5) ylabel('intercept', 'rotation', -22.5) zlabel('error') diff --git a/regression/code/gradientDescent.m b/regression/code/gradientDescent.m index d4d18b9..a482279 100644 --- a/regression/code/gradientDescent.m +++ b/regression/code/gradientDescent.m @@ -1,26 +1,27 @@ -clear -close all -load('lin_regression.mat') +% x, y from exercise 8.3 -position = [-2. 10.]; +% some arbitrary values for the slope and the intercept to start with: +position = [-2.
10.]; + +% gradient descent: gradient = []; errors = []; count = 1; eps = 0.01; - while isempty(gradient) || norm(gradient) > 0.1 - gradient = lsqGradient(position, x,y); - errors(count) = lsqError(position, x, y); + gradient = meanSquaredGradient(position, x, y); + errors(count) = meanSquaredError(position, x, y); position = position - eps .* gradient; count = count + 1; end + figure() subplot(2,1,1) hold on -scatter(x,y, 'displayname', 'data') -xaxis = min(x):0.01:max(x); -f_x = position(1).*xaxis + position(2); -plot(xaxis, f_x, 'displayname', 'fit') +scatter(x, y, 'displayname', 'data') +xx = min(x):0.01:max(x); +yy = position(1).*xx + position(2); +plot(xx, yy, 'displayname', 'fit') xlabel('Input') ylabel('Output') grid on @@ -28,4 +29,4 @@ legend show subplot(2,1,2) plot(errors) xlabel('optimization steps') -ylabel('error') \ No newline at end of file +ylabel('error') diff --git a/regression/code/lsqError.m b/regression/code/lsqError.m deleted file mode 100644 index 3547266..0000000 --- a/regression/code/lsqError.m +++ /dev/null @@ -1,13 +0,0 @@ -function error = lsqError(parameter, x, y) -% Objective function for fitting a linear equation to data. -% -% Arguments: parameter, vector containing slope and intercept -% as the 1st and 2nd element -% x, vector of the input values -% y, vector of the corresponding measured output values -% -% Returns: the estimation error in terms of the mean sqaure error - - y_est = x .* parameter(1) + parameter(2); - error = meanSquareError(y, y_est); -end diff --git a/regression/code/meanSquareError.m b/regression/code/meanSquareError.m deleted file mode 100644 index 253d31a..0000000 --- a/regression/code/meanSquareError.m +++ /dev/null @@ -1,10 +0,0 @@ -function error = meanSquareError(y, y_est) -% Mean squared error between observed and predicted values. -% -% Arguments: y, vector of observed values. -% y_est, vector of predicted values. -% -% Returns: the error in the mean-squared-deviation sense. - - error = mean((y - y_est).^2); -end diff --git a/regression/code/meanSquaredError.m b/regression/code/meanSquaredError.m new file mode 100644 index 0000000..cb21324 --- /dev/null +++ b/regression/code/meanSquaredError.m @@ -0,0 +1,12 @@ +function mse = meanSquaredError(parameter, x, y) +% Mean squared error between a straight line and data pairs. +% +% Arguments: parameter, vector containing slope and intercept + % as the 1st and 2nd element, respectively. +% x, vector of the input values +% y, vector of the corresponding measured output values +% +% Returns: mse, the mean-squared-error. 
+ + mse = mean((y - x * parameter(1) - parameter(2)).^2); +end diff --git a/regression/code/meanSquaredErrorLine.m b/regression/code/meanSquaredErrorLine.m new file mode 100644 index 0000000..c395934 --- /dev/null +++ b/regression/code/meanSquaredErrorLine.m @@ -0,0 +1 @@ +mse = mean((y - y_est).^2); diff --git a/regression/code/lsqGradient.m b/regression/code/meanSquaredGradient.m similarity index 54% rename from regression/code/lsqGradient.m rename to regression/code/meanSquaredGradient.m index 39bfbbb..9817b0f 100644 --- a/regression/code/lsqGradient.m +++ b/regression/code/meanSquaredGradient.m @@ -1,5 +1,5 @@ -function gradient = lsqGradient(parameter, x, y) -% The gradient of the least square error +function gradient = meanSquaredGradient(parameter, x, y) +% The gradient of the mean squared error % % Arguments: parameter, vector containing slope and intercept % as the 1st and 2nd element @@ -7,8 +7,10 @@ function gradient = lsqGradient(parameter, x, y) % y, vector of the corresponding measured output values % % Returns: the gradient as a vector with two elements + h = 1e-6; % stepsize for derivatives - partial_m = (lsqError([parameter(1)+h, parameter(2)], x, y) - lsqError(parameter, x, y))/ h; - partial_n = (lsqError([parameter(1), parameter(2)+h], x, y) - lsqError(parameter, x, y))/ h; + mse = meanSquaredError(parameter, x, y); + partial_m = (meanSquaredError([parameter(1)+h, parameter(2)], x, y) - mse)/h; + partial_n = (meanSquaredError([parameter(1), parameter(2)+h], x, y) - mse)/h; gradient = [partial_m, partial_n]; end diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex index 1111796..efc490e 100644 --- a/regression/lecture/regression.tex +++ b/regression/lecture/regression.tex @@ -18,6 +18,8 @@ the response values is minimized. One basic numerical method used for such optimization problems is the so called gradient descent, which is introduced in this chapter. +%%% Weiteres einfaches verbales Beispiel? Eventuell aus der Populationsoekologie? + \begin{figure}[t] \includegraphics[width=1\textwidth]{lin_regress}\hfill \titlecaption{Example data suggesting a linear relation.}{A set of @@ -31,7 +33,10 @@ introduced in this chapter. The data plotted in \figref{linregressiondatafig} suggest a linear relation between input and output of the system. We thus assume that a straight line -\[y = f(x; m, b) = m\cdot x + b \] +\begin{equation} + \label{straightline} + y = f(x; m, b) = m\cdot x + b +\end{equation} is an appropriate model to describe the system. The line has two free parameter, the slope $m$ and the $y$-intercept $b$. We need to find values for the slope and the intercept that best describe the measured @@ -45,59 +50,68 @@ slope and intercept that best describes the system. Before the optimization can be done we need to specify what exactly is considered an optimal fit. In our example we search the parameter combination that describe the relation of $x$ and $y$ best. What is -meant by this? Each input $x_i$ leads to an output $y_i$ and for each -$x_i$ there is a \emph{prediction} or \emph{estimation} -$y^{est}_i$. For each of $x_i$ estimation and measurement will have a -certain distance $y_i - y_i^{est}$. In our example the estimation is -given by the linear equation $y_i^{est} = f(x;m,b)$. The best fit of -the model with the parameters $m$ and $b$ leads to the minimal -distances between observation $y_i$ and estimation $y_i^{est}$ -(\figref{leastsquareerrorfig}). - -We could require that the sum $\sum_{i=1}^N y_i - y^{est}_i$ is -minimized. 
This approach, however, will not work since a minimal sum +meant by this? Each input $x_i$ leads to a measured output $y_i$ and +for each $x_i$ there is a \emph{prediction} or \emph{estimation} +$y^{est}_i$ of the output value by the model. At each $x_i$ estimation +and measurement have a distance or error $y_i - y_i^{est}$. In our +example the estimation is given by the equation $y_i^{est} = +f(x;m,b)$. The best fitting model with parameters $m$ and $b$ is the +one that minimizes the distances between observation $y_i$ and +estimation $y_i^{est}$ (\figref{leastsquareerrorfig}). + +As a first guess we could simply minimize the sum $\sum_{i=1}^N y_i - +y^{est}_i$. This approach, however, will not work since a minimal sum can also be achieved if half of the measurements is above and the other half below the predicted line. Positive and negative errors would cancel out and then sum up to values close to zero. A better -approach is to consider the absolute value of the distance -$\sum_{i=1}^N |y_i - y^{est}_i|$. The total error can only be small if -all deviations are indeed small no matter if they are above or below -the predicted line. Instead of the sum we could also ask for the -\emph{average} +approach is to sum over the absolute values of the distances: +$\sum_{i=1}^N |y_i - y^{est}_i|$. This sum can only be small if all +deviations are indeed small no matter if they are above or below the +predicted line. Instead of the sum we could also take the average \begin{equation} \label{meanabserror} f_{dist}(\{(x_i, y_i)\}|\{y^{est}_i\}) = \frac{1}{N} \sum_{i=1}^N |y_i - y^{est}_i| \end{equation} -should be small. Commonly, the \enterm{mean squared distance} or -\enterm[square error!mean]{mean square error} (\determ[quadratischer -Fehler!mittlerer]{mittlerer quadratischer Fehler}) +For reasons that are explained in +chapter~\ref{maximumlikelihoodchapter}, instead of the averaged +absolute errors, the \enterm[mean squared error]{mean squared error} +(\determ[quadratischer Fehler!mittlerer]{mittlerer quadratischer + Fehler}) \begin{equation} \label{meansquarederror} f_{mse}(\{(x_i, y_i)\}|\{y^{est}_i\}) = \frac{1}{N} \sum_{i=1}^N (y_i - y^{est}_i)^2 \end{equation} -is used (\figref{leastsquareerrorfig}). Similar to the absolute -distance, the square of the error, $(y_i - y_i^{est})^2$, is always -positive and thus error values do not cancel out. The square further -punishes large deviations over small deviations. - -\begin{exercise}{meanSquareError.m}{}\label{mseexercise}% - Implement a function \varcode{meanSquareError()}, that calculates the - \emph{mean square distance} between a vector of observations ($y$) - and respective predictions ($y^{est}$). +is commonly used (\figref{leastsquareerrorfig}). Similar to the +absolute distance, the square of the errors, $(y_i - y_i^{est})^2$, is +always positive and thus positive and negative error values do not +cancel each other out. In addition, the square punishes large +deviations more strongly than small ones. + +\begin{exercise}{meanSquaredErrorLine.m}{}\label{mseexercise}% + Given a vector of observations \varcode{y} and a vector with the + corresponding predictions \varcode{y\_est}, compute the \emph{mean + square error} between \varcode{y} and \varcode{y\_est} in a single + line of code. \end{exercise} -\section{\tr{Objective function}{Zielfunktion}} +\section{Objective function} -$f_{cost}(\{(x_i, y_i)\}|\{y^{est}_i\})$ is a so called -\enterm{objective function} or \enterm{cost function} -(\determ{Kostenfunktion}).
We aim to adapt the model parameters to -minimize the error (mean square error) and thus the \emph{objective - function}. In Chapter~\ref{maximumlikelihoodchapter} we will show -that the minimization of the mean square error is equivalent to +The mean squared error is a so-called \enterm{objective function} or +\enterm{cost function} (\determ{Kostenfunktion}), $f_{cost}(\{(x_i, +y_i)\}|\{y^{est}_i\})$. A cost function assigns to the given data set +$\{(x_i, y_i)\}$ and corresponding model predictions $\{y^{est}_i\}$ a +single scalar value that we want to minimize. Here we aim to adapt the +model parameters to minimize the mean squared error +\eqref{meansquarederror}. In chapter~\ref{maximumlikelihoodchapter} we +show that the minimization of the mean square error is equivalent to maximizing the likelihood that the observations originate from the model (assuming a normal distribution of the data around the model -prediction). +prediction). The \enterm{cost function} does not have to be the mean +square error but can be any function that maps the data and the +predictions to a scalar value describing the quality of the fit. In +the optimization process we aim for the parameter combination that +minimizes the costs. \begin{figure}[t] \includegraphics[width=1\textwidth]{linear_least_squares} @@ -109,58 +123,44 @@ prediction). \label{leastsquareerrorfig} \end{figure} -The error or also \enterm{cost function} is not necessarily the mean -square distance but can be any function that maps the predictions to a -scalar value describing the quality of the fit. In the optimization -process we aim for the paramter combination that minimized the costs -(error). - -%%% Einfaches verbales Beispiel? Eventuell aus der Populationsoekologie? -Replacing $y^{est}$ with the linear equation (the model) in -(\eqnref{meansquarederror}) we yield: - +Replacing $y^{est}$ with our model, the straight line +\eqref{straightline}, yields \begin{eqnarray} f_{cost}(\{(x_i, y_i)\}|m,b) & = & \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i;m,b))^2 \label{msefunc} \\ & = & \frac{1}{N} \sum_{i=1}^N (y_i - m x_i - b)^2 \label{mseline} \end{eqnarray} - -That is, the mean square error is given the pairs $(x_i, y_i)$ and the -parameters $m$ and $b$ of the linear equation. The optimization -process tries to optimize $m$ and $b$ such that the error is -minimized, the method of the \enterm[square error!least]{least square - error} (\determ[quadratischer Fehler!kleinster]{Methode der - kleinsten Quadrate}). - -\begin{exercise}{lsqError.m}{} - Implement the objective function \varcode{lsqError()} that applies the - linear equation as a model. - \begin{itemize} - \item The function takes three arguments. The first is a 2-element - vector that contains the values of parameters \varcode{m} and - \varcode{b}. The second is a vector of x-values the third contains - the measurements for each value of $x$, the respecive $y$-values. - \item The function returns the mean square error \eqnref{mseline}. - \item The function should call the function \varcode{meanSquareError()} - defined in the previouos exercise to calculate the error. - \end{itemize} +That is, the mean square error is given by the pairs $(x_i, y_i)$ of +measurements and the parameters $m$ and $b$ of the straight line. The +optimization process tries to find $m$ and $b$ such that the cost +function is minimized.
With the mean squared error as the cost +function this optimization process is also called the method of the +\enterm{least square error} (\determ[quadratischer +Fehler!kleinster]{Methode der kleinsten Quadrate}). + +\begin{exercise}{meanSquaredError.m}{} + Implement the objective function \varcode{meanSquaredError()} that + uses a straight line, \eqnref{straightline}, as a model. The + function takes three arguments. The first is a 2-element vector that + contains the values of parameters \varcode{m} and \varcode{b}. The + second is a vector of x-values, and the third contains the + measurements for each value of $x$, the respective $y$-values. The + function returns the mean square error \eqnref{mseline}. \end{exercise} \section{Error surface} -The two parameters of the model define a surface. For each combination -of $m$ and $b$ we can use \eqnref{mseline} to calculate the associated -error. We thus consider the objective function $f_{cost}(\{(x_i, -y_i)\}|m,b)$ as a function $f_{cost}(m,b)$, that maps the variables -$m$ and $b$ to an error value. - -Thus, for each spot of the surface we get an error that we can -illustrate graphically using a 3-d surface-plot, i.e. the error -surface. $m$ and $b$ are plotted on the $x-$ and $y-$ axis while the -third dimension is used to indicate the error value -(\figref{errorsurfacefig}). +For each combination of the two parameters $m$ and $b$ of the model we +can use \eqnref{mseline} to calculate the corresponding value of the +cost function. We thus consider the cost function $f_{cost}(\{(x_i, +y_i)\}|m,b)$ as a function $f_{cost}(m,b)$, that maps the parameter +values $m$ and $b$ to an error value. The error values describe a +landscape over the $m$-$b$ plane, the error surface, that can be +illustrated graphically using a 3-d surface-plot. $m$ and $b$ are +plotted on the $x-$ and $y-$ axis while the third dimension indicates +the error value (\figref{errorsurfacefig}). \begin{figure}[t] - \includegraphics[width=0.75\columnwidth]{error_surface} + \includegraphics[width=0.75\textwidth]{error_surface} \titlecaption{Error surface.}{The two model parameters $m$ and $b$ define the base area of the surface plot. For each parameter combination of slope and intercept the error is calculated. The @@ -168,32 +168,36 @@ third dimension is used to indicate the error value combination that best fits the data.}\label{errorsurfacefig} \end{figure} -\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}% - Load the dataset \textit{lin\_regression.mat} into the workspace (20 - data pairs contained in the vectors \varcode{x} and - \varcode{y}). Implement a script \file{errorSurface.m}, that - calculates the mean square error between data and a linear model and - illustrates the error surface using the \code{surf()} function - (consult the help to find out how to use \code{surf()}.). +\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise} + Generate 20 data pairs $(x_i|y_i)$ that are linearly related with + slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for + drawing $x$ values between 0 and 120 and \varcode{randn()} for + jittering the $y$ values with a standard deviation of 15. Then + calculate the mean squared error between the data and straight lines + for a range of slopes and intercepts using the + \varcode{meanSquaredError()} function from the previous exercise. + Illustrate the error surface using the \code{surface()} function + (consult the help to find out how to use \code{surface()}).
\end{exercise} By looking at the error surface we can directly see the position of the minimum and thus estimate the optimal parameter combination. How can we use the error surface to guide an automatic optimization -process. +process? The obvious approach would be to calculate the error surface and then -find the position of the minimum. The approach, however has several -disadvantages: (I) it is computationally very expensive to calculate -the error for each parameter combination. The number of combinations -increases exponentially with the number of free parameters (also known -as the ``curse of dimensionality''). (II) the accuracy with which the -best parameters can be estimated is limited by the resolution with -which the parameter space was sampled. If the grid is too large, one -might miss the minimum. - -We thus want a procedure that finds the minimum with a minimal number -of computations. +find the position of the minimum using the \code{min} function. This +approach, however has several disadvantages: (i) it is computationally +very expensive to calculate the error for each parameter +combination. The number of combinations increases exponentially with +the number of free parameters (also known as the ``curse of +dimensionality''). (ii) the accuracy with which the best parameters +can be estimated is limited by the resolution used to sample the +parameter space. The coarser the parameters are sampled the less +precise is the obtained position of the minimum. + +We want a procedure that finds the minimum of the cost function with a minimal number +of computations and to arbitrary precision. \begin{ibox}[t]{\label{differentialquotientbox}Difference quotient and derivative} \includegraphics[width=0.33\textwidth]{derivative} @@ -217,13 +221,13 @@ of computations. f'(x) = \frac{{\rm d} f(x)}{{\rm d}x} = \lim\limits_{\Delta x \to 0} \frac{f(x + \Delta x) - f(x)}{\Delta x} \end{equation} \end{minipage}\vspace{2ex} - It is not possible to calculate this numerically - (\eqnref{derivative}). The derivative can only be estimated using - the difference quotient \eqnref{difffrac} by using sufficiently - small $\Delta x$. + It is not possible to calculate the derivative, \eqnref{derivative}, + numerically. The derivative can only be estimated using the + difference quotient, \eqnref{difffrac}, by using sufficiently small + $\Delta x$. \end{ibox} -\begin{ibox}[t]{\label{partialderivativebox}Partial derivative and gradient} +\begin{ibox}[tp]{\label{partialderivativebox}Partial derivative and gradient} Some functions that depend on more than a single variable: \[ z = f(x,y) \] for example depends on $x$ and $y$. Using the partial derivative @@ -234,33 +238,33 @@ of computations. individually by using the respective difference quotient (Box~\ref{differentialquotientbox}). 
\vspace{1ex} - \begin{minipage}[t]{0.44\textwidth} + \begin{minipage}[t]{0.5\textwidth} \mbox{}\\[-2ex] \includegraphics[width=1\textwidth]{gradient} \end{minipage} \hfill - \begin{minipage}[t]{0.52\textwidth} + \begin{minipage}[t]{0.46\textwidth} For example, the partial derivatives of \[ f(x,y) = x^2+y^2 \] are \[ \frac{\partial f(x,y)}{\partial x} = 2x \; , \quad \frac{\partial f(x,y)}{\partial y} = 2y \; .\] - The gradient is a vector that constructed from the partial derivatives: + The gradient is a vector that is constructed from the partial derivatives: \[ \nabla f(x,y) = \left( \begin{array}{c} \frac{\partial f(x,y)}{\partial x} \\[1ex] \frac{\partial f(x,y)}{\partial y} \end{array} \right) \] This vector points into the direction of the strongest ascend of $f(x,y)$. \end{minipage} - \vspace{1ex} The figure shows the contour lines of a bi-variate + \vspace{0.5ex} The figure shows the contour lines of a bi-variate Gaussian $f(x,y) = \exp(-(x^2+y^2)/2)$ and the gradient (thick - arrow) and the two partial derivatives (thin arrows) for three - different locations. + arrows) and the corresponding two partial derivatives (thin arrows) + for three different locations. \end{ibox} \section{Gradient} Imagine to place a small ball at some point on the error surface -\figref{errorsurfacefig}. Naturally, it would follow the steepest -slope and would stop at the minimum of the error surface (if it had no +\figref{errorsurfacefig}. Naturally, it would roll down the steepest +slope and eventually stop at the minimum of the error surface (if it had no inertia). We will use this picture to develop an algorithm to find our way to the minimum of the objective function. The ball will always follow the steepest slope. Thus we need to figure out the direction of @@ -268,33 +272,31 @@ the steepest slope at the position of the ball. The \entermde{Gradient}{gradient} (Box~\ref{partialderivativebox}) of the objective function is the vector - -\[ \nabla f_{cost}(m,b) = \left( \frac{\partial f(m,b)}{\partial m}, -\frac{\partial f(m,b)}{\partial b} \right) \] - -that points to the strongest ascend of the objective function. Since -we want to reach the minimum we simply choose the opposite direction. - -The gradient is given by partial derivatives -(Box~\ref{partialderivativebox}) with respect to the parameters $m$ -and $b$ of the linear equation. There is no need to calculate it -analytically but it can be estimated from the partial derivatives -using the difference quotient (Box~\ref{differentialquotientbox}) for -small steps $\Delta m$ und $\Delta b$. For example the partial -derivative with respect to $m$: - +\begin{equation} + \label{gradient} + \nabla f_{cost}(m,b) = \left( \frac{\partial f(m,b)}{\partial m}, + \frac{\partial f(m,b)}{\partial b} \right) +\end{equation} +that points to the strongest ascend of the objective function. The +gradient is given by partial derivatives +(Box~\ref{partialderivativebox}) of the mean squared error with +respect to the parameters $m$ and $b$ of the straight line. There is +no need to calculate it analytically because it can be estimated from +the partial derivatives using the difference quotient +(Box~\ref{differentialquotientbox}) for small steps $\Delta m$ and +$\Delta b$. For example, the partial derivative with respect to $m$ +can be computed as \[\frac{\partial f_{cost}(m,b)}{\partial m} = \lim\limits_{\Delta m \to 0} \frac{f_{cost}(m + \Delta m, b) - f_{cost}(m,b)}{\Delta m} \approx \frac{f_{cost}(m + \Delta m, b) - f_{cost}(m,b)}{\Delta m} \; . 
\] - The length of the gradient indicates the steepness of the slope (\figref{gradientquiverfig}). Since want to go down the hill, we choose the opposite direction. \begin{figure}[t] - \includegraphics[width=0.75\columnwidth]{error_gradient} + \includegraphics[width=0.75\textwidth]{error_gradient} \titlecaption{Gradient of the error surface.} {Each arrow points into the direction of the greatest ascend at different positions of the error surface shown in \figref{errorsurfacefig}. The @@ -304,60 +306,60 @@ choose the opposite direction. error.}\label{gradientquiverfig} \end{figure} -\begin{exercise}{lsqGradient.m}{}\label{gradientexercise}% - Implement a function \varcode{lsqGradient()}, that takes the set of - parameters $(m, b)$ of the linear equation as a two-element vector - and the $x$- and $y$-data as input arguments. The function should - return the gradient at that position. +\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}% + Implement a function \varcode{meanSquaredGradient()}, that takes the + set of parameters $(m, b)$ of a straight line as a two-element + vector and the $x$- and $y$-data as input arguments. The function + should return the gradient at that position as a vector with two + elements. \end{exercise} \begin{exercise}{errorGradient.m}{} - Use the functions from the previous - exercises~\ref{errorsurfaceexercise} and~\ref{gradientexercise} to - estimate and plot the error surface including the gradients. Choose - a subset of parameter combinations for which you plot the - gradient. Vectors in space can be easily plotted using the function - \code{quiver()}. + Extend the script of exercises~\ref{errorsurfaceexercise} to plot + both the error surface and gradients using the + \varcode{meanSquaredGradient()} function from + exercise~\ref{gradientexercise}. Vectors in space can be easily + plotted using the function \code{quiver()}. Use \code{contour()} + instead of \code{surface()} to plot the error surface. \end{exercise} \section{Gradient descent} Finally, we are able to implement the optimization itself. By now it should be obvious why it is called the gradient descent method. All -ingredients are already there. We need: 1. The error function -(\varcode{meanSquareError}), 2. the objective function -(\varcode{lsqError()}), and 3. the gradient (\varcode{lsqGradient()}). The -algorithm of the gradient descent is: - +ingredients are already there. We need: (i) the cost function +(\varcode{meanSquaredError()}), and (ii) the gradient +(\varcode{meanSquaredGradient()}). The algorithm of the gradient +descent works as follows: \begin{enumerate} -\item Start with any given combination of the parameters $m$ and $b$ ($p_0 = (m_0, - b_0)$). +\item Start with some given combination of the parameters $m$ and $b$ + ($p_0 = (m_0, b_0)$). \item \label{computegradient} Calculate the gradient at the current position $p_i$. \item If the length of the gradient falls below a certain value, we assume to have reached the minimum and stop the search. We are actually looking for the point at which the length of the gradient - is zero but finding zero is impossible for numerical reasons. We - thus apply a threshold below which we are sufficiently close to zero - (e.g. \varcode{norm(gradient) < 0.1}). + is zero, but finding zero is impossible because of numerical + imprecision. We thus apply a threshold below which we are + sufficiently close to zero (e.g. \varcode{norm(gradient) < 0.1}). 
\item \label{gradientstep} If the length of the gradient exceeds the - threshold we take a small step into the opposite direction - ($\epsilon = 0.01$): + threshold we take a small step into the opposite direction: \[p_{i+1} = p_i - \epsilon \cdot \nabla f_{cost}(m_i, b_i)\] -\item Repeat steps \ref{computegradient} -- - \ref{gradientstep}. + where $\epsilon = 0.01$ is a factor linking the gradient to + appropriate steps in the parameter space. +\item Repeat steps \ref{computegradient} -- \ref{gradientstep}. \end{enumerate} -\Figref{gradientdescentfig} illustrates the gradient descent (the path -the imaginary ball has chosen to reach the minimum). Starting at an -arbitrary position on the error surface we change the position as long -as the gradient at that position is larger than a certain +\Figref{gradientdescentfig} illustrates the gradient descent --- the +path the imaginary ball has chosen to reach the minimum. Starting at +an arbitrary position on the error surface we change the position as +long as the gradient at that position is larger than a certain threshold. If the slope is very steep, the change in the position (the distance between the red dots in \figref{gradientdescentfig}) is large. \begin{figure}[t] - \includegraphics[width=0.6\columnwidth]{gradient_descent} + \includegraphics[width=0.55\textwidth]{gradient_descent} \titlecaption{Gradient descent.}{The algorithm starts at an arbitrary position. At each point the gradient is estimated and the position is updated as long as the length of the gradient is @@ -366,53 +368,56 @@ large. \end{figure} \begin{exercise}{gradientDescent.m}{} - Implement the gradient descent for the problem of the linear - equation for the measured data in file \file{lin\_regression.mat}. + Implement the gradient descent for the problem of fitting a straight + line to some measured data. Reuse the data generated in + exercise~\ref{errorsurfaceexercise}. \begin{enumerate} \item Store for each iteration the error value. - \item Create a plot that shows the error value as a function of the + \item Plot the error values as a function of the iterations, the number of optimization steps. - \item Create a plot that shows the measured data and the best fit. + \item Plot the measured data together with the best fitting straight line. \end{enumerate} \end{exercise} \section{Summary} -The gradient descent is an important method for solving optimization -problems. It is used to find the global minimum of an objective -function. +The gradient descent is an important numerical method for solving +optimization problems. It is used to find the global minimum of an +objective function. -In the case of the linear equation the error surface (using the mean -square error) shows a clearly defined minimum. The position of the -minimum can be analytically calculated. The next chapter will -introduce how this can be done without using the gradient descent -\matlabfun{polyfit()}. +Curve fitting is a common application for the gradient descent method. +For the case of fitting straight lines to data pairs, the error +surface (using the mean squared error) has exactly one clearly defined +global minimum. In fact, the position of the minimum can be analytically +calculated as shown in the next chapter. Problems that involve nonlinear computations on parameters, e.g. the -rate $\lambda$ in the exponential function $f(x;\lambda) = -e^{\lambda x}$, do not have an analytical solution. 
To find minima -in such functions numerical methods such as the gradient descent have -to be applied. +rate $\lambda$ in an exponential function $f(x;\lambda) = e^{\lambda + x}$, do not have an analytical solution for the least squares. To +find the least squares for such functions numerical methods such as +the gradient descent have to be applied. The suggested gradient descent algorithm can be improved in multiple ways to converge faster. For example one could adapt the step size to the length of the gradient. These numerical tricks have already been implemented in pre-defined functions. Generic optimization functions such as \matlabfun{fminsearch()} have been implemented for arbitrary -objective functions while more specialized functions are specifically -designed for optimizations in the least square error sense -\matlabfun{lsqcurvefit()}. +objective functions, while the more specialized function +\matlabfun{lsqcurvefit()} is specifically designed for optimizations in +the least square error sense. %\newpage \begin{important}[Beware of secondary minima!] Finding the absolute minimum is not always as easy as in the case of - the linear equation. Often, the error surface has secondary or local - minima in which the gradient descent stops even though there is a - more optimal solution. Starting from good start positions is a good - approach to avoid getting stuck in local minima. Further it is - easier to optimize as few parameters as possible. Each additional - parameter increases complexity and is computationally expensive. + fitting a straight line. Often, the error surface has secondary or + local minima in which the gradient descent stops even though there + is a better solution, a global minimum that is lower than the + local minimum. Starting from good initial positions is a good + approach to avoid getting stuck in local minima. Also keep in mind + that error surfaces tend to be simpler (fewer local minima) the fewer + parameters are fitted from the data. Each additional parameter + increases complexity and is computationally more expensive. \end{important}
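The following standalone sketch is not part of the patch; it is a minimal end-to-end check that combines the data generation of exercise 8.3 with the gradient descent loop, assuming the renamed functions meanSquaredError() and meanSquaredGradient() introduced above are on the MATLAB path.

% standalone sketch: generate data as in errorSurface.m and fit a
% straight line by gradient descent as in gradientDescent.m
m = 0.75;                            % true slope
b = -40.0;                           % true intercept
n = 20;                              % number of data pairs
x = 120.0 * rand(n, 1);              % input values between 0 and 120
y = m * x + b + 15.0 * randn(n, 1);  % output values jittered with std 15

position = [-2.0, 10.0];             % arbitrary start for slope and intercept
eps = 0.01;                          % factor scaling the gradient to steps
gradient = [];
errors = [];
count = 1;
while isempty(gradient) || norm(gradient) > 0.1
    gradient = meanSquaredGradient(position, x, y);
    errors(count) = meanSquaredError(position, x, y);
    position = position - eps .* gradient;
    count = count + 1;
end
fprintf('estimated slope %.3f and intercept %.3f\n', position(1), position(2))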
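The difference quotient used in meanSquaredGradient() can be cross-checked against the analytic partial derivatives of the mean squared error of a straight line, which follow directly from the cost function: df/dm = -(2/N) * sum(x_i*(y_i - m*x_i - b)) and df/db = -(2/N) * sum(y_i - m*x_i - b). The sketch below is only an illustrative check, not part of the patch; the data and parameter values are arbitrary.

% cross-check: difference-quotient gradient vs. analytic gradient
x = 120.0 * rand(20, 1);
y = 0.75 * x - 40.0 + 15.0 * randn(20, 1);
p = [-2.0, 10.0];                        % some slope m and intercept b
r = y - p(1) * x - p(2);                 % residuals y_i - m*x_i - b
analytic = [-2.0 * mean(x .* r), -2.0 * mean(r)];
numeric = meanSquaredGradient(p, x, y);  % difference quotient with h = 1e-6
disp([analytic; numeric])                % the two rows should nearly agree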
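To illustrate the remark in the summary about pre-defined optimization functions, MATLAB's generic fminsearch() can minimize the same cost function via a function handle. This snippet is a sketch, not part of the patch, and again assumes meanSquaredError() from above is available; the start values are arbitrary.

% let a generic optimizer minimize the mean squared error of the line
x = 120.0 * rand(20, 1);
y = 0.75 * x - 40.0 + 15.0 * randn(20, 1);
pstart = [-2.0, 10.0];                   % start values for slope and intercept
pest = fminsearch(@(p) meanSquaredError(p, x, y), pstart);
fprintf('fminsearch: slope %.3f, intercept %.3f\n', pest(1), pest(2))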