[regression] finished main text and exercises

This commit is contained in:
Jan Benda 2020-12-19 22:56:09 +01:00
parent bfb2f66de2
commit 46affef86d
5 changed files with 128 additions and 198 deletions

View File

@ -1,32 +1,42 @@
% x, y from exercise 8.3
function [p, ps, mses] = gradientDescent(x, y, func, p0, epsilon, threshold)
% Gradient descent for fitting a function to data pairs.
%
% Arguments: x, vector of the x-data values.
%            y, vector of the corresponding y-data values.
%            func, function handle func(x, p) of the model to be fitted.
%            p0, vector with initial parameter values.
%            epsilon: factor multiplying the gradient (the step size).
%            threshold: minimum norm of the gradient for terminating
%            the iteration.
%
% Returns: p, vector with the final parameter values.
%          ps: 2D-vector with all the parameter vectors traversed.
%          mses: vector with the corresponding mean squared errors
p = p0;
% sentinel gradient with a large norm, ensuring the loop is entered:
gradient = ones(1, length(p0)) * 1000.0;
ps = [];
mses = [];
while norm(gradient) > threshold
    ps = [ps, p(:)];  % store current parameters as a column vector
    mses = [mses, meanSquaredError(x, y, func, p)];
    gradient = meanSquaredGradient(x, y, func, p);
    p = p - epsilon * gradient;  % walk down the gradient
end
end
function mse = meanSquaredError(x, y, func, p)
% Mean squared deviation between data pairs and a model function.
%
% Arguments: x, vector of the x-data values.
%            y, vector of the corresponding y-data values.
%            func, function handle func(x, p) of the model.
%            p, vector with parameter values for func.
%
% Returns: mse, the mean squared error.
deviations = y - func(x, p);
mse = mean(deviations .^ 2);
end
% gradient descent:
gradient = [];
errors = [];
count = 1;
eps = 0.0001;
while isempty(gradient) || norm(gradient) > 0.1
gradient = meanSquaredGradient(x, y, position);
errors(count) = meanSquaredError(x, y, position);
position = position - eps .* gradient;
count = count + 1;
function gradmse = meanSquaredGradient(x, y, func, p)
% Numerical gradient of the mean squared error with respect to the
% model parameters.
%
% Arguments: x, vector of the x-data values.
%            y, vector of the corresponding y-data values.
%            func, function handle func(x, p) of the model.
%            p, vector with the parameter values at which the
%            gradient is evaluated.
%
% Returns: gradmse, vector of the partial derivatives of the mean
%          squared error, with the same shape as p.
gradmse = zeros(size(p, 1), size(p, 2));
h = 1e-5; % stepsize for derivatives
mse = meanSquaredError(x, y, func, p);
for i = 1:length(p) % for each coordinate ...
    pd = p;         % "pd", not "pi": pi would shadow the built-in constant
    pd(i) = pd(i) + h; % displace i-th parameter
    msepd = meanSquaredError(x, y, func, pd);
    gradmse(i) = (msepd - mse)/h; % forward-difference estimate
end
end
% Plot the data together with the fitted line (top panel) and the
% decay of the error over the optimization steps (bottom panel).
% NOTE(review): relies on x, y, position, and errors computed earlier
% in the script - confirm they are in the workspace.
figure()
subplot(2,1,1)
hold on
scatter(x, y, 'displayname', 'data')
% dense x-values for plotting the fit as a smooth line:
xx = min(x):0.01:max(x);
yy = position(1).*xx + position(2);  % straight line: slope and intercept
plot(xx, yy, 'displayname', 'fit')
xlabel('Input')
ylabel('Output')
grid on
legend show
subplot(2,1,2)
plot(errors)  % mean squared error recorded at each iteration
xlabel('optimization steps')
ylabel('error')

View File

@ -1,41 +0,0 @@
function [p, ps, mses] = gradientDescentPower(x, y, p0, epsilon, threshold)
% Gradient descent for fitting a power-law to data pairs.
%
% Arguments: x, vector of the x-data values.
%            y, vector of the corresponding y-data values.
%            p0, vector with initial values for c and alpha.
%            epsilon: factor multiplying the gradient.
%            threshold: minimum value for gradient
%
% Returns: p, vector with the final parameter values.
%          ps: 2D-vector with all the parameter tuples traversed.
%          mses: vector with the corresponding mean squared errors
p = p0;
ps = [];
mses = [];
% sentinel with a large norm so that the loop body runs at least once:
grad = 1000.0 * ones(1, length(p0));
while norm(grad) > threshold
    ps = [ps, p(:)];  % record the parameter tuple of this iteration
    mses = [mses, meanSquaredErrorPower(x, y, p)];
    grad = meanSquaredGradientPower(x, y, p);
    p = p - epsilon * grad;  % step down the error surface
end
end
function mse = meanSquaredErrorPower(x, y, p)
% Mean squared deviation between data pairs and a power law
% y = c * x^alpha.
%
% Arguments: x, vector of the x-data values.
%            y, vector of the corresponding y-data values.
%            p, two-element vector with the factor c and the power alpha.
%
% Returns: mse, the mean squared error.
prediction = p(1) * x.^p(2);
mse = mean((y - prediction).^2);
end
function gradmse = meanSquaredGradientPower(x, y, p)
% Numerical gradient of the power-law mean squared error with respect
% to the parameters c and alpha.
%
% Arguments: x, vector of the x-data values.
%            y, vector of the corresponding y-data values.
%            p, two-element vector with the factor c and the power alpha
%            at which the gradient is evaluated.
%
% Returns: gradmse, vector of the partial derivatives of the mean
%          squared error, with the same shape as p.
gradmse = zeros(size(p, 1), size(p, 2));
h = 1e-5; % stepsize for derivatives
mse = meanSquaredErrorPower(x, y, p);
for i = 1:length(p) % for each coordinate ...
    pd = p;         % "pd", not "pi": pi would shadow the built-in constant
    pd(i) = pd(i) + h; % displace i-th parameter
    msepd = meanSquaredErrorPower(x, y, pd);
    gradmse(i) = (msepd - mse)/h; % forward-difference estimate
end
end

View File

@ -3,7 +3,7 @@ meansquarederrorline; % generate data
p0 = [2.0, 1.0];
eps = 0.00001;
thresh = 50.0;
[pest, ps, mses] = gradientDescentPower(x, y, p0, eps, thresh);
[pest, ps, mses] = gradientDescent(x, y, @powerLaw, p0, eps, thresh);
pest
subplot(2, 2, 1); % top left panel
@ -22,7 +22,7 @@ subplot(1, 2, 2); % right panel
hold on;
% generate x-values for plottig the fit:
xx = min(x):0.01:max(x);
yy = pest(1) * xx.^pest(2);
yy = powerLaw(xx, pest);
plot(xx, yy);
plot(x, y, 'o'); % plot original data
xlabel('Size [m]');

View File

@ -23,80 +23,6 @@
\item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom!
\end{itemize}
\subsection{Start with one-dimensional problem!}
\begin{itemize}
\item How to plot a function (do not use the data x values!)
\item 1-d gradient descent
\item Describe in words the n-d problem (Boltzmann as example?).
\item Homework is to do the 2d problem with the straight line!
\item NO quiver plot (it is a nightmare to get this right)
\end{itemize}
\subsection{2D fit}
\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
drawing $x$ values between 0 and 120 and \varcode{randn()} for
jittering the $y$ values with a standard deviation of 15. Then
calculate the mean squared error between the data and straight lines
for a range of slopes and intercepts using the
\varcode{meanSquaredError()} function from the previous exercise.
Illustrate the error surface using the \code{surface()} function.
Consult the documentation to find out how to use \code{surface()}.
\end{exercise}
\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
Implement a function \varcode{meanSquaredGradient()}, that takes the
$x$- and $y$-data and the set of parameters $(m, b)$ of a straight
line as a two-element vector as input arguments. The function should
return the gradient at the position $(m, b)$ as a vector with two
elements.
\end{exercise}
\begin{exercise}{errorGradient.m}{}
Extend the script of exercises~\ref{errorsurfaceexercise} to plot
both the error surface and gradients using the
\varcode{meanSquaredGradient()} function from
exercise~\ref{gradientexercise}. Vectors in space can be easily
plotted using the function \code{quiver()}. Use \code{contour()}
instead of \code{surface()} to plot the error surface.
\end{exercise}
\begin{exercise}{gradientDescent.m}{}
Implement the gradient descent for the problem of fitting a straight
line to some measured data. Reuse the data generated in
exercise~\ref{errorsurfaceexercise}.
\begin{enumerate}
\item Store for each iteration the error value.
\item Plot the error values as a function of the iterations, the
number of optimization steps.
\item Plot the measured data together with the best fitting straight line.
\end{enumerate}\vspace{-4.5ex}
\end{exercise}
\begin{figure}[t]
\includegraphics[width=1\textwidth]{lin_regress}\hfill
\titlecaption{Example data suggesting a linear relation.}{A set of
input signals $x$, e.g. stimulus intensities, were used to probe a
system. The system's output $y$ to the inputs are noted
(left). Assuming a linear relation between $x$ and $y$ leaves us
with 2 parameters, the slope (center) and the intercept with the
y-axis (right panel).}\label{linregressiondatafig}
\end{figure}
\begin{figure}[t]
\includegraphics[width=1\textwidth]{linear_least_squares}
\titlecaption{Estimating the \emph{mean square error}.} {The
deviation error (orange) between the prediction (red line) and the
observations (blue dots) is calculated for each data point
(left). Then the deviations are squared and the average is
calculated (right).}
\label{leastsquareerrorfig}
\end{figure}
\begin{figure}[t]
\includegraphics[width=0.75\textwidth]{error_surface}
\titlecaption{Error surface.}{The two model parameters $m$ and $b$

View File

@ -479,49 +479,73 @@ the sum of the squared partial derivatives:
\end{equation}
The \code{norm()} function implements this.
\begin{exercise}{gradientDescentPower.m}{}
\section{Passing a function as an argument to another function}
So far, all our code for the gradient descent algorithm was tailored
to a specific function, the cubic relation \eqref{cubicfunc}. It would
be much better if we could pass an arbitrary function to our gradient
algorithm. Then we would not need to rewrite it every time anew.
This is possible. We can indeed pass a function as an argument to
another function. For this use the \code{@}-operator. As an example
let's define a function that produces a standard plot for a function:
\pageinputlisting[caption={Example function taking a function as argument.}]{funcPlotter.m}
This function can then be used as follows for plotting a sine wave. We
pass the built in \varcode{sin()} function as \varcode{@sin} as an
argument to our function:
\pageinputlisting[caption={Passing a function handle as an argument to a function.}]{funcplotterexamples.m}
\section{Gradient descent algorithm for arbitrary functions}
Now we are ready to adapt the gradient descent algorithm from
exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$
parameters that we want to fit to some data.
\begin{exercise}{gradientDescent.m}{}
Adapt the function \varcode{gradientDescentCubic()} from
exercise~\ref{gradientdescentexercise} to implement the gradient
descent algorithm for the power law \eqref{powerfunc}. The new
function takes a two element vector $(c,\alpha)$ for the initial
parameter values and also returns the best parameter combination as
a two-element vector. Use a \varcode{for} loop over the two
dimensions for computing the gradient.
descent algorithm for any function \varcode{func(x, p)} that takes
as first argument the $x$-data values and as second argument a
vector with parameter values. The new function takes a vector $\vec
p_0$ for the initial parameter values and also returns the best
parameter combination as a vector. Use a \varcode{for} loop over the
two dimensions for computing the gradient.
\end{exercise}
For testing our new function we need to implement the power law
\eqref{powerfunc}:
\begin{exercise}{powerLaw.m}{}
Write a function that implements \eqref{powerfunc}. The function
gets as arguments a vector $x$ containing the $x$-data values and
another vector containing as elements the parameters for the power
law, i.e. the factor $c$ and the power $\alpha$. It returns a vector
with the computed $y$ values for each $x$ value.
\end{exercise}
Now let's use the new gradient descent function to fit a power law to
our tiger data-set:
\begin{exercise}{plotgradientdescentpower.m}{}
Use the function \varcode{gradientDescentPower()} to fit the
simulated data from exercise~\ref{mseexercise}. Plot the returned
values of the two parameters against each other. Compare the result
of the gradient descent method with the true values of $c$ and
$\alpha$ used to simulate the data. Observe the norm of the gradient
and inspect the plots to adapt $\epsilon$ (smaller than in
Use the function \varcode{gradientDescent()} to fit the
\varcode{powerLaw()} function to the simulated data from
exercise~\ref{mseexercise}. Plot the returned values of the two
parameters against each other. Compare the result of the gradient
descent method with the true values of $c$ and $\alpha$ used to
simulate the data. Observe the norm of the gradient and inspect the
plots to adapt $\epsilon$ (smaller than in
exercise~\ref{plotgradientdescentexercise}) and the threshold (much
larger) appropriately. Finally plot the data together with the best
fitting power-law \eqref{powerfunc}.
\end{exercise}
\section{Curve fit for arbitrary functions}
So far, all our code for the gradient descent algorithm was tailored
to a specific function, the cubic relation \eqref{cubicfunc} or the
power law \eqref{powerfunc}.
\section{XXX}
For example, you measure the response of a passive membrane to a
current step and you want to estimate the membrane time constant. Then you
need to fit an exponential function
\begin{equation}
\label{expfunc}
V(t; \tau, \Delta V, V_{\infty}) = \Delta V e^{-t/\tau} + V_{\infty}
\end{equation}
with three free parameters $\tau$, $\Delta V$, $V_{\infty}$ to the
measured time course of the membrane potential $V(t)$. The $(x_i,y_i)$
data pairs are the sampling times $t_i$ and the corresponding
measurements of the membrane potential $V_i$.
\section{Summary}
\section{Fitting non-linear functions to data}
The gradient descent is an important numerical method for solving
optimization problems. It is used to find the global minimum of an
@ -530,33 +554,44 @@ objective function.
Curve fitting is a common application for the gradient descent method.
For the case of fitting straight lines to data pairs, the error
surface (using the mean squared error) has exactly one clearly defined
global minimum. In fact, the position of the minimum can be analytically
calculated as shown in the next chapter.
Problems that involve nonlinear computations on parameters, e.g. the
rate $\lambda$ in an exponential function $f(x;\lambda) = e^{\lambda
x}$, do not have an analytical solution for the least squares. To
find the least squares for such functions numerical methods such as
the gradient descent have to be applied.
The suggested gradient descent algorithm can be improved in multiple
ways to converge faster. For example one could adapt the step size to
the length of the gradient. These numerical tricks have already been
implemented in pre-defined functions. Generic optimization functions
such as \matlabfun{fminsearch()} have been implemented for arbitrary
objective functions, while the more specialized function
\matlabfun{lsqcurvefit()} is specifically designed for optimizations in
the least square error sense.
%\newpage
global minimum. In fact, the position of the minimum can be
analytically calculated as shown in the next chapter. For linear
fitting problems numerical methods like the gradient descent are not
needed.
Fitting problems that involve nonlinear functions of the parameters,
e.g. the power law \eqref{powerfunc} or the exponential function
$f(x;\lambda) = e^{\lambda x}$, do not have an analytical solution for
the least squares. To find the least squares for such functions
numerical methods such as the gradient descent have to be applied.
The suggested gradient descent algorithm is quite fragile and requires
manually tuned values for $\epsilon$ and the threshold for terminating
the iteration. The algorithm can be improved in multiple ways to
converge more robustly and faster. For example one could adapt the
step size to the length of the gradient. These numerical tricks have
already been implemented in pre-defined functions. Generic
optimization functions such as \mcode{fminsearch()} have been
implemented for arbitrary objective functions, while the more
specialized function \mcode{lsqcurvefit()} is specifically designed
for optimizations in the least square error sense.
\begin{exercise}{plotlsqcurvefitpower.m}{}
Use the \matlab-function \varcode{lsqcurvefit()} instead of
\varcode{gradientDescent()} to fit the \varcode{powerLaw()} function
to the simulated data from exercise~\ref{mseexercise}. Plot the data
and the resulting best fitting power law function.
\end{exercise}
\begin{important}[Beware of secondary minima!]
Finding the absolute minimum is not always as easy as in the case of
fitting a straight line. Often, the error surface has secondary or
fitting a straight line. Often, the cost function has secondary or
local minima in which the gradient descent stops even though there
is a more optimal solution, a global minimum that is lower than the
local minimum. Starting from good initial positions is a good
approach to avoid getting stuck in local minima. Also keep in mind
that error surfaces tend to be simpler (fewer local minima) the fewer
that cost functions tend to be simpler (fewer local minima) the fewer
parameters are fitted from the data. Each additional parameter
increases complexity and is computationally more expensive.
\end{important}