From 46affef86d61bd4ae301979bb46ba326a8e676c2 Mon Sep 17 00:00:00 2001 From: Jan Benda Date: Sat, 19 Dec 2020 22:56:09 +0100 Subject: [PATCH] [regression] finished main text and exercises --- regression/code/gradientDescent.m | 66 ++++++---- regression/code/gradientDescentPower.m | 41 ------ regression/code/plotgradientdescentpower.m | 4 +- regression/lecture/regression-chapter.tex | 74 ----------- regression/lecture/regression.tex | 141 +++++++++++++-------- 5 files changed, 128 insertions(+), 198 deletions(-) delete mode 100644 regression/code/gradientDescentPower.m diff --git a/regression/code/gradientDescent.m b/regression/code/gradientDescent.m index 1919035..48b976c 100644 --- a/regression/code/gradientDescent.m +++ b/regression/code/gradientDescent.m @@ -1,32 +1,42 @@ -% x, y from exercise 8.3 +function [p, ps, mses] = gradientDescent(x, y, func, p0, epsilon, threshold) +% Gradient descent for fitting a function to data pairs. +% +% Arguments: x, vector of the x-data values. +% y, vector of the corresponding y-data values. +% func, function handle func(x, p) +% p0, vector with initial parameter values +% epsilon: factor multiplying the gradient. +% threshold: minimum value for gradient +% +% Returns: p, vector with the final parameter values. +% ps: 2D-vector with all the parameter vectors traversed. +% mses: vector with the corresponding mean squared errors -% some arbitrary values for the slope and the intercept to start with: -position = [-2.0, 10.0]; + p = p0; + gradient = ones(1, length(p0)) * 1000.0; + ps = []; + mses = []; + while norm(gradient) > threshold + ps = [ps, p(:)]; + mses = [mses, meanSquaredError(x, y, func, p)]; + gradient = meanSquaredGradient(x, y, func, p); + p = p - epsilon * gradient; + end +end + +function mse = meanSquaredError(x, y, func, p) + mse = mean((y - func(x, p)).^2); +end -% gradient descent: -gradient = []; -errors = []; -count = 1; -eps = 0.0001; -while isempty(gradient) || norm(gradient) > 0.1 - gradient = meanSquaredGradient(x, y, position); - errors(count) = meanSquaredError(x, y, position); - position = position - eps .* gradient; - count = count + 1; +function gradmse = meanSquaredGradient(x, y, func, p) + gradmse = zeros(size(p, 1), size(p, 2)); + h = 1e-5; % stepsize for derivatives + mse = meanSquaredError(x, y, func, p); + for i = 1:length(p) % for each coordinate ... + pi = p; + pi(i) = pi(i) + h; % displace i-th parameter + msepi = meanSquaredError(x, y, func, pi); + gradmse(i) = (msepi - mse)/h; + end end -figure() -subplot(2,1,1) -hold on -scatter(x, y, 'displayname', 'data') -xx = min(x):0.01:max(x); -yy = position(1).*xx + position(2); -plot(xx, yy, 'displayname', 'fit') -xlabel('Input') -ylabel('Output') -grid on -legend show -subplot(2,1,2) -plot(errors) -xlabel('optimization steps') -ylabel('error') diff --git a/regression/code/gradientDescentPower.m b/regression/code/gradientDescentPower.m deleted file mode 100644 index 31390eb..0000000 --- a/regression/code/gradientDescentPower.m +++ /dev/null @@ -1,41 +0,0 @@ -function [p, ps, mses] = gradientDescentPower(x, y, p0, epsilon, threshold) -% Gradient descent for fitting a power-law. -% -% Arguments: x, vector of the x-data values. -% y, vector of the corresponding y-data values. -% p0, vector with initial values for c and alpha. -% epsilon: factor multiplying the gradient. -% threshold: minimum value for gradient -% -% Returns: p, vector with the final parameter values. -% ps: 2D-vector with all the parameter tuples traversed. -% mses: vector with the corresponding mean squared errors - - p = p0; - gradient = ones(1, length(p0)) * 1000.0; - ps = []; - mses = []; - while norm(gradient) > threshold - ps = [ps, p(:)]; - mses = [mses, meanSquaredErrorPower(x, y, p)]; - gradient = meanSquaredGradientPower(x, y, p); - p = p - epsilon * gradient; - end -end - -function mse = meanSquaredErrorPower(x, y, p) - mse = mean((y - p(1)*x.^p(2)).^2); -end - -function gradmse = meanSquaredGradientPower(x, y, p) - gradmse = zeros(size(p, 1), size(p, 2)); - h = 1e-5; % stepsize for derivatives - mse = meanSquaredErrorPower(x, y, p); - for i = 1:length(p) % for each coordinate ... - pi = p; - pi(i) = pi(i) + h; % displace i-th parameter - msepi = meanSquaredErrorPower(x, y, pi); - gradmse(i) = (msepi - mse)/h; - end -end - diff --git a/regression/code/plotgradientdescentpower.m b/regression/code/plotgradientdescentpower.m index f52bfe7..206e2bb 100644 --- a/regression/code/plotgradientdescentpower.m +++ b/regression/code/plotgradientdescentpower.m @@ -3,7 +3,7 @@ meansquarederrorline; % generate data p0 = [2.0, 1.0]; eps = 0.00001; thresh = 50.0; -[pest, ps, mses] = gradientDescentPower(x, y, p0, eps, thresh); +[pest, ps, mses] = gradientDescent(x, y, @powerLaw, p0, eps, thresh); pest subplot(2, 2, 1); % top left panel @@ -22,7 +22,7 @@ subplot(1, 2, 2); % right panel hold on; % generate x-values for plottig the fit: xx = min(x):0.01:max(x); -yy = pest(1) * xx.^pest(2); +yy = powerLaw(xx, pest); plot(xx, yy); plot(x, y, 'o'); % plot original data xlabel('Size [m]'); diff --git a/regression/lecture/regression-chapter.tex b/regression/lecture/regression-chapter.tex index 3983848..2ca1ec2 100644 --- a/regression/lecture/regression-chapter.tex +++ b/regression/lecture/regression-chapter.tex @@ -23,80 +23,6 @@ \item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom! \end{itemize} -\subsection{Start with one-dimensional problem!} -\begin{itemize} -\item How to plot a function (do not use the data x values!) -\item 1-d gradient descend -\item Describe in words the n-d problem (boltzman as example?). -\item Homework is to do the 2d problem with the straight line! -\item NO quiver plot (it is a nightmare to get this right) -\end{itemize} - -\subsection{2D fit} - -\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise} - Generate 20 data pairs $(x_i|y_i)$ that are linearly related with - slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for - drawing $x$ values between 0 and 120 and \varcode{randn()} for - jittering the $y$ values with a standard deviation of 15. Then - calculate the mean squared error between the data and straight lines - for a range of slopes and intercepts using the - \varcode{meanSquaredError()} function from the previous exercise. - Illustrates the error surface using the \code{surface()} function. - Consult the documentation to find out how to use \code{surface()}. -\end{exercise} - -\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}% - Implement a function \varcode{meanSquaredGradient()}, that takes the - $x$- and $y$-data and the set of parameters $(m, b)$ of a straight - line as a two-element vector as input arguments. The function should - return the gradient at the position $(m, b)$ as a vector with two - elements. -\end{exercise} - -\begin{exercise}{errorGradient.m}{} - Extend the script of exercises~\ref{errorsurfaceexercise} to plot - both the error surface and gradients using the - \varcode{meanSquaredGradient()} function from - exercise~\ref{gradientexercise}. Vectors in space can be easily - plotted using the function \code{quiver()}. Use \code{contour()} - instead of \code{surface()} to plot the error surface. -\end{exercise} - - -\begin{exercise}{gradientDescent.m}{} - Implement the gradient descent for the problem of fitting a straight - line to some measured data. Reuse the data generated in - exercise~\ref{errorsurfaceexercise}. - \begin{enumerate} - \item Store for each iteration the error value. - \item Plot the error values as a function of the iterations, the - number of optimization steps. - \item Plot the measured data together with the best fitting straight line. - \end{enumerate}\vspace{-4.5ex} -\end{exercise} - - -\begin{figure}[t] - \includegraphics[width=1\textwidth]{lin_regress}\hfill - \titlecaption{Example data suggesting a linear relation.}{A set of - input signals $x$, e.g. stimulus intensities, were used to probe a - system. The system's output $y$ to the inputs are noted - (left). Assuming a linear relation between $x$ and $y$ leaves us - with 2 parameters, the slope (center) and the intercept with the - y-axis (right panel).}\label{linregressiondatafig} -\end{figure} - -\begin{figure}[t] - \includegraphics[width=1\textwidth]{linear_least_squares} - \titlecaption{Estimating the \emph{mean square error}.} {The - deviation error (orange) between the prediction (red line) and the - observations (blue dots) is calculated for each data point - (left). Then the deviations are squared and the average is - calculated (right).} - \label{leastsquareerrorfig} -\end{figure} - \begin{figure}[t] \includegraphics[width=0.75\textwidth]{error_surface} \titlecaption{Error surface.}{The two model parameters $m$ and $b$ diff --git a/regression/lecture/regression.tex b/regression/lecture/regression.tex index 69e7b05..36e1808 100644 --- a/regression/lecture/regression.tex +++ b/regression/lecture/regression.tex @@ -479,49 +479,73 @@ the sum of the squared partial derivatives: \end{equation} The \code{norm()} function implements this. -\begin{exercise}{gradientDescentPower.m}{} + +\section{Passing a function as an argument to another function} + +So far, all our code for the gradient descent algorithm was tailored +to a specific function, the cubic relation \eqref{cubicfunc}. It would +be much better if we could pass an arbitrary function to our gradient +algorithm. Then we would not need to rewrite it every time anew. + +This is possible. We can indeed pass a function as an argument to +another function. For this use the \code{@}-operator. As an example +let's define a function that produces a standard plot for a function: + +\pageinputlisting[caption={Example function taking a function as argument.}]{funcPlotter.m} + +This function can then be used as follows for plotting a sine wave. We +pass the built in \varcode{sin()} function as \varcode{@sin} as an +argument to our function: + +\pageinputlisting[caption={Passing a function handle as an argument to a function.}]{funcplotterexamples.m} + + +\section{Gradient descent algorithm for arbitrary functions} + +Now we are ready to adapt the gradient descent algorithm from +exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$ +parameters that we want to fit to some data. + +\begin{exercise}{gradientDescent.m}{} Adapt the function \varcode{gradientDescentCubic()} from exercise~\ref{gradientdescentexercise} to implement the gradient - descent algorithm for the power law \eqref{powerfunc}. The new - function takes a two element vector $(c,\alpha)$ for the initial - parameter values and also returns the best parameter combination as - a two-element vector. Use a \varcode{for} loop over the two - dimensions for computing the gradient. + descent algorithm for any function \varcode{func(x, p)} that takes + as first argument the $x$-data values and as second argument a + vector with parameter values. The new function takes a vector $\vec + p_0$ for the initial parameter values and also returns the best + parameter combination as a vector. Use a \varcode{for} loop over the + two dimensions for computing the gradient. \end{exercise} +For testing our new function we need to implement the power law +\eqref{powerfunc}: + +\begin{exercise}{powerLaw.m}{} + Write a function that implements \eqref{powerfunc}. The function + gets as arguments a vector $x$ containing the $x$-data values and + another vector containing as elements the parameters for the power + law, i.e. the factor $c$ and the power $\alpha$. It returns a vector + with the computed $y$ values for each $x$ value. +\end{exercise} + +Now let's use the new gradient descent function to fit a power law to +our tiger data-set: + \begin{exercise}{plotgradientdescentpower.m}{} - Use the function \varcode{gradientDescentPower()} to fit the - simulated data from exercise~\ref{mseexercise}. Plot the returned - values of the two parameters against each other. Compare the result - of the gradient descent method with the true values of $c$ and - $\alpha$ used to simulate the data. Observe the norm of the gradient - and inspect the plots to adapt $\epsilon$ (smaller than in + Use the function \varcode{gradientDescent()} to fit the + \varcode{powerLaw()} function to the simulated data from + exercise~\ref{mseexercise}. Plot the returned values of the two + parameters against each other. Compare the result of the gradient + descent method with the true values of $c$ and $\alpha$ used to + simulate the data. Observe the norm of the gradient and inspect the + plots to adapt $\epsilon$ (smaller than in exercise~\ref{plotgradientdescentexercise}) and the threshold (much larger) appropriately. Finally plot the data together with the best fitting power-law \eqref{powerfunc}. \end{exercise} -\section{Curve fit for arbitrary functions} - -So far, all our code for the gradient descent algorithm was tailored -to a specific function, the cubic relation \eqref{cubicfunc} or the -power law \eqref{powerfunc}. - -\section{XXX} -For example, you measure the response of a passive membrane to a -current step and you want to estimate membrane the time constant. Then you -need to fit an exponential function -\begin{equation} - \label{expfunc} - V(t; \tau, \Delta V, V_{\infty}) = \Delta V e^{-t/\tau} + V_{\infty} -\end{equation} -with three free parameters $\tau$, $\Delta y$, $y_{\infty}$ to the -measured time course of the membrane potential $V(t)$. The $(x_i,y_i)$ -data pairs are the sampling times $t_i$ and the corresponding -measurements of the membrane potential $V_i$. - -\section{Summary} +\section{Fitting non-linear functions to data} The gradient descent is an important numerical method for solving optimization problems. It is used to find the global minimum of an @@ -530,33 +554,44 @@ objective function. Curve fitting is a common application for the gradient descent method. For the case of fitting straight lines to data pairs, the error surface (using the mean squared error) has exactly one clearly defined -global minimum. In fact, the position of the minimum can be analytically -calculated as shown in the next chapter. - -Problems that involve nonlinear computations on parameters, e.g. the -rate $\lambda$ in an exponential function $f(x;\lambda) = e^{\lambda - x}$, do not have an analytical solution for the least squares. To -find the least squares for such functions numerical methods such as -the gradient descent have to be applied. - -The suggested gradient descent algorithm can be improved in multiple -ways to converge faster. For example one could adapt the step size to -the length of the gradient. These numerical tricks have already been -implemented in pre-defined functions. Generic optimization functions -such as \matlabfun{fminsearch()} have been implemented for arbitrary -objective functions, while the more specialized function -\matlabfun{lsqcurvefit()} i specifically designed for optimizations in -the least square error sense. - -%\newpage +global minimum. In fact, the position of the minimum can be +analytically calculated as shown in the next chapter. For linear +fitting problems numerical methods like the gradient descent are not +needed. + +Fitting problems that involve nonlinear functions of the parameters, +e.g. the power law \eqref{powerfunc} or the exponential function +$f(x;\lambda) = e^{\lambda x}$, do not have an analytical solution for +the least squares. To find the least squares for such functions +numerical methods such as the gradient descent have to be applied. + +The suggested gradient descent algorithm is quite fragile and requires +manually tuned values for $\epsilon$ and the threshold for terminating +the iteration. The algorithm can be improved in multiple ways to +converge more robustly and faster. For example one could adapt the +step size to the length of the gradient. These numerical tricks have +already been implemented in pre-defined functions. Generic +optimization functions such as \mcode{fminsearch()} have been +implemented for arbitrary objective functions, while the more +specialized function \mcode{lsqcurvefit()} is specifically designed +for optimizations in the least square error sense. + +\begin{exercise}{plotlsqcurvefitpower.m}{} + Use the \matlab-function \varcode{lsqcurvefit()} instead of + \varcode{gradientDescent()} to fit the \varcode{powerLaw()} function + to the simulated data from exercise~\ref{mseexercise}. Plot the data + and the resulting best fitting power law function. +\end{exercise} + + \begin{important}[Beware of secondary minima!] Finding the absolute minimum is not always as easy as in the case of - fitting a straight line. Often, the error surface has secondary or + fitting a straight line. Often, the cost function has secondary or local minima in which the gradient descent stops even though there is a more optimal solution, a global minimum that is lower than the local minimum. Starting from good initial positions is a good approach to avoid getting stuck in local minima. Also keep in mind - that error surfaces tend to be simpler (less local minima) the fewer + that cost functions tend to be simpler (less local minima) the fewer parameters are fitted from the data. Each additional parameter increases complexity and is computationally more expensive. \end{important}