[regression] finished main text and exercises
parent bfb2f66de2
commit 46affef86d
@@ -1,32 +1,42 @@
-% x, y from exercise 8.3
-% some arbitrary values for the slope and the intercept to start with:
-position = [-2.0, 10.0];
-
-% gradient descent:
-gradient = [];
-errors = [];
-count = 1;
-eps = 0.0001;
-while isempty(gradient) || norm(gradient) > 0.1
-  gradient = meanSquaredGradient(x, y, position);
-  errors(count) = meanSquaredError(x, y, position);
-  position = position - eps .* gradient;
-  count = count + 1;
-end
-
-figure()
-subplot(2,1,1)
-hold on
-scatter(x, y, 'displayname', 'data')
-xx = min(x):0.01:max(x);
-yy = position(1).*xx + position(2);
-plot(xx, yy, 'displayname', 'fit')
-xlabel('Input')
-ylabel('Output')
-grid on
-legend show
-subplot(2,1,2)
-plot(errors)
-xlabel('optimization steps')
-ylabel('error')
+function [p, ps, mses] = gradientDescent(x, y, func, p0, epsilon, threshold)
+% Gradient descent for fitting a function to data pairs.
+%
+% Arguments: x, vector of the x-data values.
+%            y, vector of the corresponding y-data values.
+%            func, function handle func(x, p)
+%            p0, vector with initial parameter values
+%            epsilon: factor multiplying the gradient.
+%            threshold: minimum value for gradient
+%
+% Returns: p, vector with the final parameter values.
+%          ps: 2D-vector with all the parameter vectors traversed.
+%          mses: vector with the corresponding mean squared errors
+
+  p = p0;
+  gradient = ones(1, length(p0)) * 1000.0;
+  ps = [];
+  mses = [];
+  while norm(gradient) > threshold
+    ps = [ps, p(:)];
+    mses = [mses, meanSquaredError(x, y, func, p)];
+    gradient = meanSquaredGradient(x, y, func, p);
+    p = p - epsilon * gradient;
+  end
+end
+
+function mse = meanSquaredError(x, y, func, p)
+  mse = mean((y - func(x, p)).^2);
+end
+
+function gradmse = meanSquaredGradient(x, y, func, p)
+  gradmse = zeros(size(p, 1), size(p, 2));
+  h = 1e-5; % stepsize for derivatives
+  mse = meanSquaredError(x, y, func, p);
+  for i = 1:length(p) % for each coordinate ...
+    pi = p;
+    pi(i) = pi(i) + h; % displace i-th parameter
+    msepi = meanSquaredError(x, y, func, pi);
+    gradmse(i) = (msepi - mse)/h;
+  end
+end
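As a usage sketch (not part of this commit; x, y and the numerical values are assumed, taken from the old script above for illustration), the rewritten gradientDescent() can fit a straight line by passing an anonymous function handle:

  lineFunc = @(x, p) p(1)*x + p(2);   % straight line with slope p(1) and intercept p(2)
  p0 = [-2.0, 10.0];                  % initial slope and intercept
  [pest, ps, mses] = gradientDescent(x, y, lineFunc, p0, 0.0001, 0.1);
  plot(mses)                          % the mean squared error should decrease over the iterations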
@@ -1,41 +0,0 @@
-function [p, ps, mses] = gradientDescentPower(x, y, p0, epsilon, threshold)
-% Gradient descent for fitting a power-law.
-%
-% Arguments: x, vector of the x-data values.
-%            y, vector of the corresponding y-data values.
-%            p0, vector with initial values for c and alpha.
-%            epsilon: factor multiplying the gradient.
-%            threshold: minimum value for gradient
-%
-% Returns: p, vector with the final parameter values.
-%          ps: 2D-vector with all the parameter tuples traversed.
-%          mses: vector with the corresponding mean squared errors
-
-  p = p0;
-  gradient = ones(1, length(p0)) * 1000.0;
-  ps = [];
-  mses = [];
-  while norm(gradient) > threshold
-    ps = [ps, p(:)];
-    mses = [mses, meanSquaredErrorPower(x, y, p)];
-    gradient = meanSquaredGradientPower(x, y, p);
-    p = p - epsilon * gradient;
-  end
-end
-
-function mse = meanSquaredErrorPower(x, y, p)
-  mse = mean((y - p(1)*x.^p(2)).^2);
-end
-
-function gradmse = meanSquaredGradientPower(x, y, p)
-  gradmse = zeros(size(p, 1), size(p, 2));
-  h = 1e-5; % stepsize for derivatives
-  mse = meanSquaredErrorPower(x, y, p);
-  for i = 1:length(p) % for each coordinate ...
-    pi = p;
-    pi(i) = pi(i) + h; % displace i-th parameter
-    msepi = meanSquaredErrorPower(x, y, pi);
-    gradmse(i) = (msepi - mse)/h;
-  end
-end
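The power-law-specific functions deleted above are superseded by the generic gradientDescent() from the first hunk. As the next hunk shows, the same fit is obtained by passing a power-law function handle (here with the same p0, epsilon and threshold arguments as before):

  [pest, ps, mses] = gradientDescent(x, y, @powerLaw, p0, epsilon, threshold);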
@@ -3,7 +3,7 @@ meansquarederrorline; % generate data
 p0 = [2.0, 1.0];
 eps = 0.00001;
 thresh = 50.0;
-[pest, ps, mses] = gradientDescentPower(x, y, p0, eps, thresh);
+[pest, ps, mses] = gradientDescent(x, y, @powerLaw, p0, eps, thresh);
 pest
 
 subplot(2, 2, 1); % top left panel
@@ -22,7 +22,7 @@ subplot(1, 2, 2); % right panel
 hold on;
 % generate x-values for plottig the fit:
 xx = min(x):0.01:max(x);
-yy = pest(1) * xx.^pest(2);
+yy = powerLaw(xx, pest);
 plot(xx, yy);
 plot(x, y, 'o'); % plot original data
 xlabel('Size [m]');
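The updated script assumes a helper powerLaw() implementing the power law. That function is not part of this diff, but from the replaced line yy = pest(1) * xx.^pest(2) a minimal sketch would be:

  function y = powerLaw(x, p)
    % power law y = c * x^alpha with parameters p = [c, alpha]
    y = p(1) * x.^p(2);
  end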
@@ -23,80 +23,6 @@
 \item Fig 8.2 right: this should be a chi-squared distribution with one degree of freedom!
 \end{itemize}
-
-\subsection{Start with one-dimensional problem!}
-\begin{itemize}
-\item How to plot a function (do not use the data x values!)
-\item 1-d gradient descend
-\item Describe in words the n-d problem (boltzman as example?).
-\item Homework is to do the 2d problem with the straight line!
-\item NO quiver plot (it is a nightmare to get this right)
-\end{itemize}
-
-\subsection{2D fit}
-
-\begin{exercise}{errorSurface.m}{}\label{errorsurfaceexercise}
-Generate 20 data pairs $(x_i|y_i)$ that are linearly related with
-slope $m=0.75$ and intercept $b=-40$, using \varcode{rand()} for
-drawing $x$ values between 0 and 120 and \varcode{randn()} for
-jittering the $y$ values with a standard deviation of 15. Then
-calculate the mean squared error between the data and straight lines
-for a range of slopes and intercepts using the
-\varcode{meanSquaredError()} function from the previous exercise.
-Illustrates the error surface using the \code{surface()} function.
-Consult the documentation to find out how to use \code{surface()}.
-\end{exercise}
-
-\begin{exercise}{meanSquaredGradient.m}{}\label{gradientexercise}%
-Implement a function \varcode{meanSquaredGradient()}, that takes the
-$x$- and $y$-data and the set of parameters $(m, b)$ of a straight
-line as a two-element vector as input arguments. The function should
-return the gradient at the position $(m, b)$ as a vector with two
-elements.
-\end{exercise}
-
-\begin{exercise}{errorGradient.m}{}
-Extend the script of exercises~\ref{errorsurfaceexercise} to plot
-both the error surface and gradients using the
-\varcode{meanSquaredGradient()} function from
-exercise~\ref{gradientexercise}. Vectors in space can be easily
-plotted using the function \code{quiver()}. Use \code{contour()}
-instead of \code{surface()} to plot the error surface.
-\end{exercise}
-
-\begin{exercise}{gradientDescent.m}{}
-Implement the gradient descent for the problem of fitting a straight
-line to some measured data. Reuse the data generated in
-exercise~\ref{errorsurfaceexercise}.
-\begin{enumerate}
-\item Store for each iteration the error value.
-\item Plot the error values as a function of the iterations, the
-number of optimization steps.
-\item Plot the measured data together with the best fitting straight line.
-\end{enumerate}\vspace{-4.5ex}
-\end{exercise}
-
-\begin{figure}[t]
-\includegraphics[width=1\textwidth]{lin_regress}\hfill
-\titlecaption{Example data suggesting a linear relation.}{A set of
-input signals $x$, e.g. stimulus intensities, were used to probe a
-system. The system's output $y$ to the inputs are noted
-(left). Assuming a linear relation between $x$ and $y$ leaves us
-with 2 parameters, the slope (center) and the intercept with the
-y-axis (right panel).}\label{linregressiondatafig}
-\end{figure}
-
-\begin{figure}[t]
-\includegraphics[width=1\textwidth]{linear_least_squares}
-\titlecaption{Estimating the \emph{mean square error}.} {The
-deviation error (orange) between the prediction (red line) and the
-observations (blue dots) is calculated for each data point
-(left). Then the deviations are squared and the average is
-calculated (right).}
-\label{leastsquareerrorfig}
-\end{figure}
 
 \begin{figure}[t]
 \includegraphics[width=0.75\textwidth]{error_surface}
 \titlecaption{Error surface.}{The two model parameters $m$ and $b$
@@ -479,49 +479,73 @@ the sum of the squared partial derivatives:
 \end{equation}
 The \code{norm()} function implements this.
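For example (illustrative, not part of the diff), for a gradient with two partial derivatives the following two expressions give the same length:

  g = [3.0, 4.0];
  norm(g)           % returns 5
  sqrt(sum(g.^2))   % the sum of squared partial derivatives, square-rooted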
 
-\begin{exercise}{gradientDescentPower.m}{}
+\section{Passing a function as an argument to another function}
+
+So far, all our code for the gradient descent algorithm was tailored
+to a specific function, the cubic relation \eqref{cubicfunc}. It would
+be much better if we could pass an arbitrary function to our gradient
+algorithm. Then we would not need to rewrite it every time anew.
+
+This is possible. We can indeed pass a function as an argument to
+another function. For this use the \code{@}-operator. As an example
+let's define a function that produces a standard plot for a function:
+
+\pageinputlisting[caption={Example function taking a function as argument.}]{funcPlotter.m}
+
+This function can then be used as follows for plotting a sine wave. We
+pass the built in \varcode{sin()} function as \varcode{@sin} as an
+argument to our function:
+
+\pageinputlisting[caption={Passing a function handle as an argument to a function.}]{funcplotterexamples.m}
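The two listings referenced here, funcPlotter.m and funcplotterexamples.m, are not included in this diff. A minimal sketch of what such a plotting helper and its call with @sin might look like (an assumption, not the repository's code):

  function funcPlotter(func)
    % plot an arbitrary function handle over a fixed range
    xx = -10.0:0.01:10.0;
    plot(xx, func(xx));
    xlabel('x');
    ylabel('f(x)');
  end

  % usage: pass the built-in sin() function as a handle
  funcPlotter(@sin)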
+
+\section{Gradient descent algorithm for arbitrary functions}
+
+Now we are ready to adapt the gradient descent algorithm from
+exercise~\ref{gradientdescentexercise} to arbitrary functions with $n$
+parameters that we want to fit to some data.
+
+\begin{exercise}{gradientDescent.m}{}
 Adapt the function \varcode{gradientDescentCubic()} from
 exercise~\ref{gradientdescentexercise} to implement the gradient
-descent algorithm for the power law \eqref{powerfunc}. The new
-function takes a two element vector $(c,\alpha)$ for the initial
-parameter values and also returns the best parameter combination as
-a two-element vector. Use a \varcode{for} loop over the two
-dimensions for computing the gradient.
+descent algorithm for any function \varcode{func(x, p)} that takes
+as first argument the $x$-data values and as second argument a
+vector with parameter values. The new function takes a vector $\vec
+p_0$ for the initial parameter values and also returns the best
+parameter combination as a vector. Use a \varcode{for} loop over the
+two dimensions for computing the gradient.
 \end{exercise}
 
+For testing our new function we need to implement the power law
+\eqref{powerfunc}:
+
+\begin{exercise}{powerLaw.m}{}
+Write a function that implements \eqref{powerfunc}. The function
+gets as arguments a vector $x$ containing the $x$-data values and
+another vector containing as elements the parameters for the power
+law, i.e. the factor $c$ and the power $\alpha$. It returns a vector
+with the computed $y$ values for each $x$ value.
+\end{exercise}
+
+Now let's use the new gradient descent function to fit a power law to
+our tiger data-set:
+
 \begin{exercise}{plotgradientdescentpower.m}{}
-Use the function \varcode{gradientDescentPower()} to fit the
-simulated data from exercise~\ref{mseexercise}. Plot the returned
-values of the two parameters against each other. Compare the result
-of the gradient descent method with the true values of $c$ and
-$\alpha$ used to simulate the data. Observe the norm of the gradient
-and inspect the plots to adapt $\epsilon$ (smaller than in
+Use the function \varcode{gradientDescent()} to fit the
+\varcode{powerLaw()} function to the simulated data from
+exercise~\ref{mseexercise}. Plot the returned values of the two
+parameters against each other. Compare the result of the gradient
+descent method with the true values of $c$ and $\alpha$ used to
+simulate the data. Observe the norm of the gradient and inspect the
+plots to adapt $\epsilon$ (smaller than in
 exercise~\ref{plotgradientdescentexercise}) and the threshold (much
 larger) appropriately. Finally plot the data together with the best
 fitting power-law \eqref{powerfunc}.
 \end{exercise}
 
 
-\section{Curve fit for arbitrary functions}
+\section{Fitting non-linear functions to data}
 
-So far, all our code for the gradient descent algorithm was tailored
-to a specific function, the cubic relation \eqref{cubicfunc} or the
-power law \eqref{powerfunc}.
-
-\section{XXX}
-For example, you measure the response of a passive membrane to a
-current step and you want to estimate membrane the time constant. Then you
-need to fit an exponential function
-\begin{equation}
-\label{expfunc}
-V(t; \tau, \Delta V, V_{\infty}) = \Delta V e^{-t/\tau} + V_{\infty}
-\end{equation}
-with three free parameters $\tau$, $\Delta y$, $y_{\infty}$ to the
-measured time course of the membrane potential $V(t)$. The $(x_i,y_i)$
-data pairs are the sampling times $t_i$ and the corresponding
-measurements of the membrane potential $V_i$.
-
-\section{Summary}
 
 The gradient descent is an important numerical method for solving
 optimization problems. It is used to find the global minimum of an
@@ -530,33 +554,44 @@ objective function.
 Curve fitting is a common application for the gradient descent method.
 For the case of fitting straight lines to data pairs, the error
 surface (using the mean squared error) has exactly one clearly defined
-global minimum. In fact, the position of the minimum can be analytically
-calculated as shown in the next chapter.
+global minimum. In fact, the position of the minimum can be
+analytically calculated as shown in the next chapter. For linear
+fitting problems numerical methods like the gradient descent are not
+needed.
 
-Problems that involve nonlinear computations on parameters, e.g. the
-rate $\lambda$ in an exponential function $f(x;\lambda) = e^{\lambda
-x}$, do not have an analytical solution for the least squares. To
-find the least squares for such functions numerical methods such as
-the gradient descent have to be applied.
+Fitting problems that involve nonlinear functions of the parameters,
+e.g. the power law \eqref{powerfunc} or the exponential function
+$f(x;\lambda) = e^{\lambda x}$, do not have an analytical solution for
+the least squares. To find the least squares for such functions
+numerical methods such as the gradient descent have to be applied.
 
-The suggested gradient descent algorithm can be improved in multiple
-ways to converge faster. For example one could adapt the step size to
-the length of the gradient. These numerical tricks have already been
-implemented in pre-defined functions. Generic optimization functions
-such as \matlabfun{fminsearch()} have been implemented for arbitrary
-objective functions, while the more specialized function
-\matlabfun{lsqcurvefit()} i specifically designed for optimizations in
-the least square error sense.
-
-%\newpage
+The suggested gradient descent algorithm is quite fragile and requires
+manually tuned values for $\epsilon$ and the threshold for terminating
+the iteration. The algorithm can be improved in multiple ways to
+converge more robustly and faster. For example one could adapt the
+step size to the length of the gradient. These numerical tricks have
+already been implemented in pre-defined functions. Generic
+optimization functions such as \mcode{fminsearch()} have been
+implemented for arbitrary objective functions, while the more
+specialized function \mcode{lsqcurvefit()} is specifically designed
+for optimizations in the least square error sense.
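One possible reading of adapting the step size to the length of the gradient (a sketch under that assumption, not the text's prescribed solution) is to normalize the gradient inside the loop so that each iteration moves the parameters by a fixed distance epsilon:

  gradient = meanSquaredGradient(x, y, func, p);
  p = p - epsilon * gradient / norm(gradient);   % step length is always epsilon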
+
+\begin{exercise}{plotlsqcurvefitpower.m}{}
+Use the \matlab-function \varcode{lsqcurvefit()} instead of
+\varcode{gradientDescent()} to fit the \varcode{powerLaw()} function
+to the simulated data from exercise~\ref{mseexercise}. Plot the data
+and the resulting best fitting power law function.
+\end{exercise}
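A minimal sketch of such a call (assuming the Optimization Toolbox and the powerLaw() function sketched above; note that lsqcurvefit() expects the parameter vector as the first argument of the model function, so the arguments of powerLaw(x, p) have to be swapped):

  pest = lsqcurvefit(@(p, x) powerLaw(x, p), p0, x, y);
  plot(x, y, 'o');                 % data
  hold on;
  xx = min(x):0.01:max(x);
  plot(xx, powerLaw(xx, pest));    % best fitting power law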
 
 \begin{important}[Beware of secondary minima!]
 Finding the absolute minimum is not always as easy as in the case of
-fitting a straight line. Often, the error surface has secondary or
+fitting a straight line. Often, the cost function has secondary or
 local minima in which the gradient descent stops even though there
 is a more optimal solution, a global minimum that is lower than the
 local minimum. Starting from good initial positions is a good
 approach to avoid getting stuck in local minima. Also keep in mind
-that error surfaces tend to be simpler (less local minima) the fewer
+that cost functions tend to be simpler (less local minima) the fewer
 parameters are fitted from the data. Each additional parameter
 increases complexity and is computationally more expensive.
 \end{important}